Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/Kconfig | 5
-rw-r--r--  fs/9p/vfs_inode.c | 4
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 11
-rw-r--r--  fs/Kconfig | 19
-rw-r--r--  fs/affs/namei.c | 5
-rw-r--r--  fs/afs/dir.c | 5
-rw-r--r--  fs/autofs4/root.c | 2
-rw-r--r--  fs/bfs/dir.c | 3
-rw-r--r--  fs/binfmt_flat.c | 8
-rw-r--r--  fs/block_dev.c | 34
-rw-r--r--  fs/btrfs/extent_io.c | 10
-rw-r--r--  fs/btrfs/relocation.c | 2
-rw-r--r--  fs/btrfs/super.c | 2
-rw-r--r--  fs/buffer.c | 64
-rw-r--r--  fs/ceph/addr.c | 5
-rw-r--r--  fs/ceph/caps.c | 61
-rw-r--r--  fs/ceph/dir.c | 7
-rw-r--r--  fs/ceph/export.c | 25
-rw-r--r--  fs/ceph/mds_client.c | 7
-rw-r--r--  fs/ceph/mds_client.h | 1
-rw-r--r--  fs/cifs/Kconfig | 15
-rw-r--r--  fs/cifs/Makefile | 2
-rw-r--r--  fs/cifs/README | 15
-rw-r--r--  fs/cifs/cache.c | 6
-rw-r--r--  fs/cifs/cifs_debug.c | 28
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 2
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 7
-rw-r--r--  fs/cifs/cifs_spnego.c | 2
-rw-r--r--  fs/cifs/cifs_spnego.h | 2
-rw-r--r--  fs/cifs/cifs_unicode.h | 3
-rw-r--r--  fs/cifs/cifsacl.c | 490
-rw-r--r--  fs/cifs/cifsacl.h | 25
-rw-r--r--  fs/cifs/cifsencrypt.c | 26
-rw-r--r--  fs/cifs/cifsfs.c | 338
-rw-r--r--  fs/cifs/cifsfs.h | 20
-rw-r--r--  fs/cifs/cifsglob.h | 139
-rw-r--r--  fs/cifs/cifspdu.h | 37
-rw-r--r--  fs/cifs/cifsproto.h | 237
-rw-r--r--  fs/cifs/cifssmb.c | 838
-rw-r--r--  fs/cifs/connect.c | 860
-rw-r--r--  fs/cifs/dir.c | 33
-rw-r--r--  fs/cifs/export.c | 4
-rw-r--r--  fs/cifs/file.c | 543
-rw-r--r--  fs/cifs/fscache.c | 6
-rw-r--r--  fs/cifs/fscache.h | 8
-rw-r--r--  fs/cifs/inode.c | 221
-rw-r--r--  fs/cifs/ioctl.c | 2
-rw-r--r--  fs/cifs/link.c | 46
-rw-r--r--  fs/cifs/misc.c | 44
-rw-r--r--  fs/cifs/netmisc.c | 9
-rw-r--r--  fs/cifs/readdir.c | 8
-rw-r--r--  fs/cifs/sess.c | 51
-rw-r--r--  fs/cifs/smbdes.c | 418
-rw-r--r--  fs/cifs/smbencrypt.c | 124
-rw-r--r--  fs/cifs/transport.c | 264
-rw-r--r--  fs/cifs/xattr.c | 28
-rw-r--r--  fs/coda/dir.c | 5
-rw-r--r--  fs/compat.c | 235
-rw-r--r--  fs/configfs/dir.c | 2
-rw-r--r--  fs/dcache.c | 9
-rw-r--r--  fs/debugfs/file.c | 19
-rw-r--r--  fs/dlm/config.c | 9
-rw-r--r--  fs/dlm/config.h | 1
-rw-r--r--  fs/dlm/dlm_internal.h | 3
-rw-r--r--  fs/dlm/lock.c | 182
-rw-r--r--  fs/dlm/lock.h | 1
-rw-r--r--  fs/dlm/lockspace.c | 6
-rw-r--r--  fs/dlm/main.c | 2
-rw-r--r--  fs/dlm/plock.c | 65
-rw-r--r--  fs/dlm/user.c | 1
-rw-r--r--  fs/drop_caches.c | 5
-rw-r--r--  fs/ecryptfs/inode.c | 7
-rw-r--r--  fs/ecryptfs/keystore.c | 46
-rw-r--r--  fs/exec.c | 178
-rw-r--r--  fs/ext2/super.c | 3
-rw-r--r--  fs/ext3/namei.c | 80
-rw-r--r--  fs/ext3/super.c | 2
-rw-r--r--  fs/ext4/Makefile | 3
-rw-r--r--  fs/ext4/balloc.c | 146
-rw-r--r--  fs/ext4/ext4.h | 127
-rw-r--r--  fs/ext4/ext4_jbd2.c | 14
-rw-r--r--  fs/ext4/ext4_jbd2.h | 5
-rw-r--r--  fs/ext4/extents.c | 1410
-rw-r--r--  fs/ext4/file.c | 1
-rw-r--r--  fs/ext4/fsync.c | 25
-rw-r--r--  fs/ext4/inode.c | 114
-rw-r--r--  fs/ext4/mballoc.c | 459
-rw-r--r--  fs/ext4/mballoc.h | 6
-rw-r--r--  fs/ext4/migrate.c | 2
-rw-r--r--  fs/ext4/mmp.c | 351
-rw-r--r--  fs/ext4/move_extent.c | 3
-rw-r--r--  fs/ext4/namei.c | 82
-rw-r--r--  fs/ext4/page-io.c | 39
-rw-r--r--  fs/ext4/super.c | 206
-rw-r--r--  fs/ext4/xattr.c | 4
-rw-r--r--  fs/fat/cache.c | 7
-rw-r--r--  fs/fat/dir.c | 32
-rw-r--r--  fs/fat/fat.h | 15
-rw-r--r--  fs/fat/fatent.c | 4
-rw-r--r--  fs/fat/inode.c | 74
-rw-r--r--  fs/fat/misc.c | 44
-rw-r--r--  fs/fat/namei_msdos.c | 9
-rw-r--r--  fs/fat/namei_vfat.c | 9
-rw-r--r--  fs/freevxfs/vxfs_inode.c | 2
-rw-r--r--  fs/fscache/operation.c | 10
-rw-r--r--  fs/fscache/page.c | 13
-rw-r--r--  fs/fuse/dir.c | 6
-rw-r--r--  fs/gfs2/Makefile | 4
-rw-r--r--  fs/gfs2/aops.c | 8
-rw-r--r--  fs/gfs2/bmap.c | 2
-rw-r--r--  fs/gfs2/dir.c | 197
-rw-r--r--  fs/gfs2/dir.h | 4
-rw-r--r--  fs/gfs2/export.c | 2
-rw-r--r--  fs/gfs2/file.c | 46
-rw-r--r--  fs/gfs2/glock.c | 99
-rw-r--r--  fs/gfs2/glock.h | 3
-rw-r--r--  fs/gfs2/glops.c | 172
-rw-r--r--  fs/gfs2/glops.h | 2
-rw-r--r--  fs/gfs2/incore.h | 8
-rw-r--r--  fs/gfs2/inode.c | 1510
-rw-r--r--  fs/gfs2/inode.h | 8
-rw-r--r--  fs/gfs2/log.c | 208
-rw-r--r--  fs/gfs2/log.h | 2
-rw-r--r--  fs/gfs2/lops.c | 39
-rw-r--r--  fs/gfs2/main.c | 3
-rw-r--r--  fs/gfs2/meta_io.c | 2
-rw-r--r--  fs/gfs2/meta_io.h | 2
-rw-r--r--  fs/gfs2/ops_fstype.c | 32
-rw-r--r--  fs/gfs2/ops_inode.c | 1344
-rw-r--r--  fs/gfs2/quota.c | 12
-rw-r--r--  fs/gfs2/quota.h | 4
-rw-r--r--  fs/gfs2/rgrp.c | 24
-rw-r--r--  fs/gfs2/super.c | 138
-rw-r--r--  fs/gfs2/sys.c | 6
-rw-r--r--  fs/gfs2/trace_gfs2.h | 38
-rw-r--r--  fs/hfs/dir.c | 6
-rw-r--r--  fs/hfsplus/dir.c | 8
-rw-r--r--  fs/hostfs/hostfs_kern.c | 5
-rw-r--r--  fs/hpfs/namei.c | 9
-rw-r--r--  fs/hugetlbfs/inode.c | 7
-rw-r--r--  fs/inode.c | 10
-rw-r--r--  fs/jbd/commit.c | 15
-rw-r--r--  fs/jbd/journal.c | 16
-rw-r--r--  fs/jbd/transaction.c | 3
-rw-r--r--  fs/jbd2/commit.c | 28
-rw-r--r--  fs/jbd2/journal.c | 58
-rw-r--r--  fs/jbd2/transaction.c | 22
-rw-r--r--  fs/jffs2/dir.c | 5
-rw-r--r--  fs/jfs/namei.c | 5
-rw-r--r--  fs/logfs/dev_bdev.c | 1
-rw-r--r--  fs/logfs/dir.c | 5
-rw-r--r--  fs/logfs/readwrite.c | 2
-rw-r--r--  fs/mbcache.c | 10
-rw-r--r--  fs/minix/namei.c | 5
-rw-r--r--  fs/mpage.c | 7
-rw-r--r--  fs/namei.c | 382
-rw-r--r--  fs/namespace.c | 2
-rw-r--r--  fs/ncpfs/dir.c | 5
-rw-r--r--  fs/ncpfs/inode.c | 4
-rw-r--r--  fs/ncpfs/mmap.c | 2
-rw-r--r--  fs/nfs/dir.c | 5
-rw-r--r--  fs/nfs/internal.h | 2
-rw-r--r--  fs/nfsd/stats.c | 2
-rw-r--r--  fs/nilfs2/alloc.c | 12
-rw-r--r--  fs/nilfs2/bmap.c | 4
-rw-r--r--  fs/nilfs2/btnode.c | 19
-rw-r--r--  fs/nilfs2/btnode.h | 4
-rw-r--r--  fs/nilfs2/btree.c | 38
-rw-r--r--  fs/nilfs2/cpfile.c | 24
-rw-r--r--  fs/nilfs2/dat.c | 4
-rw-r--r--  fs/nilfs2/file.c | 1
-rw-r--r--  fs/nilfs2/gcinode.c | 25
-rw-r--r--  fs/nilfs2/ifile.c | 4
-rw-r--r--  fs/nilfs2/inode.c | 23
-rw-r--r--  fs/nilfs2/ioctl.c | 61
-rw-r--r--  fs/nilfs2/mdt.c | 8
-rw-r--r--  fs/nilfs2/mdt.h | 9
-rw-r--r--  fs/nilfs2/namei.c | 5
-rw-r--r--  fs/nilfs2/nilfs.h | 7
-rw-r--r--  fs/nilfs2/page.c | 79
-rw-r--r--  fs/nilfs2/page.h | 7
-rw-r--r--  fs/nilfs2/recovery.c | 12
-rw-r--r--  fs/nilfs2/segbuf.c | 17
-rw-r--r--  fs/nilfs2/segment.c | 190
-rw-r--r--  fs/nilfs2/segment.h | 2
-rw-r--r--  fs/nilfs2/sufile.c | 274
-rw-r--r--  fs/nilfs2/sufile.h | 4
-rw-r--r--  fs/nilfs2/super.c | 131
-rw-r--r--  fs/nilfs2/the_nilfs.c | 24
-rw-r--r--  fs/nilfs2/the_nilfs.h | 2
-rw-r--r--  fs/ocfs2/Makefile | 1
-rw-r--r--  fs/ocfs2/alloc.c | 166
-rw-r--r--  fs/ocfs2/alloc.h | 1
-rw-r--r--  fs/ocfs2/cluster/sys.c | 9
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 14
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 94
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 255
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 1
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 2
-rw-r--r--  fs/ocfs2/file.c | 1
-rw-r--r--  fs/ocfs2/ioctl.c | 492
-rw-r--r--  fs/ocfs2/move_extents.c | 1152
-rw-r--r--  fs/ocfs2/move_extents.h | 22
-rw-r--r--  fs/ocfs2/ocfs2_ioctl.h | 68
-rw-r--r--  fs/ocfs2/ocfs2_trace.h | 25
-rw-r--r--  fs/ocfs2/refcounttree.c | 60
-rw-r--r--  fs/ocfs2/refcounttree.h | 11
-rw-r--r--  fs/ocfs2/super.c | 4
-rw-r--r--  fs/omfs/dir.c | 11
-rw-r--r--  fs/partitions/check.c | 10
-rw-r--r--  fs/partitions/efi.c | 9
-rw-r--r--  fs/partitions/ldm.c | 7
-rw-r--r--  fs/proc/Makefile | 1
-rw-r--r--  fs/proc/array.c | 4
-rw-r--r--  fs/proc/base.c | 103
-rw-r--r--  fs/proc/generic.c | 1
-rw-r--r--  fs/proc/inode.c | 7
-rw-r--r--  fs/proc/internal.h | 26
-rw-r--r--  fs/proc/namespaces.c | 198
-rw-r--r--  fs/proc/stat.c | 6
-rw-r--r--  fs/proc/task_mmu.c | 233
-rw-r--r--  fs/proc/vmcore.c | 52
-rw-r--r--  fs/pstore/platform.c | 12
-rw-r--r--  fs/quota/dquot.c | 5
-rw-r--r--  fs/reiserfs/namei.c | 5
-rw-r--r--  fs/reiserfs/xattr.c | 1
-rw-r--r--  fs/splice.c | 33
-rw-r--r--  fs/squashfs/Kconfig | 4
-rw-r--r--  fs/squashfs/block.c | 2
-rw-r--r--  fs/squashfs/cache.c | 33
-rw-r--r--  fs/squashfs/decompressor.c | 2
-rw-r--r--  fs/squashfs/decompressor.h | 2
-rw-r--r--  fs/squashfs/dir.c | 2
-rw-r--r--  fs/squashfs/export.c | 42
-rw-r--r--  fs/squashfs/file.c | 2
-rw-r--r--  fs/squashfs/fragment.c | 37
-rw-r--r--  fs/squashfs/id.c | 42
-rw-r--r--  fs/squashfs/inode.c | 2
-rw-r--r--  fs/squashfs/namei.c | 2
-rw-r--r--  fs/squashfs/squashfs.h | 10
-rw-r--r--  fs/squashfs/squashfs_fs.h | 2
-rw-r--r--  fs/squashfs/squashfs_fs_i.h | 2
-rw-r--r--  fs/squashfs/squashfs_fs_sb.h | 2
-rw-r--r--  fs/squashfs/super.c | 112
-rw-r--r--  fs/squashfs/symlink.c | 2
-rw-r--r--  fs/squashfs/xattr.c | 2
-rw-r--r--  fs/squashfs/xattr.h | 3
-rw-r--r--  fs/squashfs/xattr_id.c | 47
-rw-r--r--  fs/squashfs/xz_wrapper.c | 2
-rw-r--r--  fs/squashfs/zlib_wrapper.c | 2
-rw-r--r--  fs/super.c | 6
-rw-r--r--  fs/sysfs/file.c | 12
-rw-r--r--  fs/sysfs/group.c | 6
-rw-r--r--  fs/sysv/namei.c | 5
-rw-r--r--  fs/timerfd.c | 102
-rw-r--r--  fs/ubifs/budget.c | 104
-rw-r--r--  fs/ubifs/commit.c | 2
-rw-r--r--  fs/ubifs/debug.c | 167
-rw-r--r--  fs/ubifs/debug.h | 178
-rw-r--r--  fs/ubifs/dir.c | 9
-rw-r--r--  fs/ubifs/file.c | 28
-rw-r--r--  fs/ubifs/find.c | 10
-rw-r--r--  fs/ubifs/gc.c | 71
-rw-r--r--  fs/ubifs/io.c | 33
-rw-r--r--  fs/ubifs/journal.c | 29
-rw-r--r--  fs/ubifs/log.c | 28
-rw-r--r--  fs/ubifs/lprops.c | 115
-rw-r--r--  fs/ubifs/lpt_commit.c | 55
-rw-r--r--  fs/ubifs/master.c | 8
-rw-r--r--  fs/ubifs/misc.h | 17
-rw-r--r--  fs/ubifs/orphan.c | 3
-rw-r--r--  fs/ubifs/recovery.c | 354
-rw-r--r--  fs/ubifs/replay.c | 468
-rw-r--r--  fs/ubifs/sb.c | 153
-rw-r--r--  fs/ubifs/super.c | 46
-rw-r--r--  fs/ubifs/tnc.c | 10
-rw-r--r--  fs/ubifs/tnc_commit.c | 18
-rw-r--r--  fs/ubifs/ubifs-media.h | 30
-rw-r--r--  fs/ubifs/ubifs.h | 86
-rw-r--r--  fs/ubifs/xattr.c | 8
-rw-r--r--  fs/udf/namei.c | 5
-rw-r--r--  fs/ufs/balloc.c | 9
-rw-r--r--  fs/ufs/inode.c | 2
-rw-r--r--  fs/ufs/namei.c | 5
-rw-r--r--  fs/ufs/truncate.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 26
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.c | 29
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.h | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c | 3
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_message.c | 20
-rw-r--r--  fs/xfs/linux-2.6/xfs_message.h | 7
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 22
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 15
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 76
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 6
-rw-r--r--  fs/xfs/xfs_ag.h | 4
-rw-r--r--  fs/xfs/xfs_alloc.c | 871
-rw-r--r--  fs/xfs/xfs_alloc.h | 18
-rw-r--r--  fs/xfs/xfs_alloc_btree.c | 16
-rw-r--r--  fs/xfs/xfs_bmap.c | 549
-rw-r--r--  fs/xfs/xfs_bmap.h | 2
-rw-r--r--  fs/xfs/xfs_dfrag.c | 6
-rw-r--r--  fs/xfs/xfs_inode.c | 19
-rw-r--r--  fs/xfs/xfs_inode.h | 1
-rw-r--r--  fs/xfs/xfs_inode_item.c | 1
-rw-r--r--  fs/xfs/xfs_log.c | 15
-rw-r--r--  fs/xfs/xfs_log.h | 2
-rw-r--r--  fs/xfs/xfs_log_cil.c | 16
-rw-r--r--  fs/xfs/xfs_log_priv.h | 2
-rw-r--r--  fs/xfs/xfs_log_recover.c | 75
-rw-r--r--  fs/xfs/xfs_mount.c | 4
-rw-r--r--  fs/xfs/xfs_mount.h | 1
-rw-r--r--  fs/xfs/xfs_trans.c | 6
-rw-r--r--  fs/xfs/xfs_types.h | 2
318 files changed, 13616 insertions, 8657 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 814ac4e213a8..0a93dc1cb4ac 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -1,6 +1,6 @@
 config 9P_FS
-	tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
-	depends on INET && NET_9P && EXPERIMENTAL
+	tristate "Plan 9 Resource Sharing Support (9P2000)"
+	depends on INET && NET_9P
 	help
 	  If you say Y here, you will get experimental support for
 	  Plan 9 resource sharing via the 9P2000 protocol.
@@ -10,7 +10,6 @@ config 9P_FS
 	  If unsure, say N.
 
 if 9P_FS
-
 config 9P_FSCACHE
 	bool "Enable 9P client caching support (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7f6c67703195..8d7f3e69ae29 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -814,6 +814,7 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
 
 int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
 {
+	dentry_unhash(d);
 	return v9fs_remove(i, d, 1);
 }
 
@@ -839,6 +840,9 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct p9_fid *newdirfid;
 	struct p9_wstat wstat;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	P9_DPRINTK(P9_DEBUG_VFS, "\n");
 	retval = 0;
 	old_inode = old_dentry->d_inode;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 82a7c38ddad0..691c78f58bef 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -259,7 +259,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 		if (IS_ERR(inode_fid)) {
 			err = PTR_ERR(inode_fid);
 			mutex_unlock(&v9inode->v_mutex);
-			goto error;
+			goto err_clunk_old_fid;
 		}
 		v9inode->writeback_fid = (void *) inode_fid;
 	}
@@ -267,8 +267,8 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 	/* Since we are opening a file, assign the open fid to the file */
 	filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
 	if (IS_ERR(filp)) {
-		p9_client_clunk(ofid);
-		return PTR_ERR(filp);
+		err = PTR_ERR(filp);
+		goto err_clunk_old_fid;
 	}
 	filp->private_data = ofid;
 #ifdef CONFIG_9P_FSCACHE
@@ -278,10 +278,11 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 	return 0;
 
 error:
-	if (ofid)
-		p9_client_clunk(ofid);
 	if (fid)
 		p9_client_clunk(fid);
+err_clunk_old_fid:
+	if (ofid)
+		p9_client_clunk(ofid);
 	return err;
 }
 
diff --git a/fs/Kconfig b/fs/Kconfig
index f3aa9b08b228..19891aab9c6e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,7 +47,7 @@ config FS_POSIX_ACL
 	def_bool n
 
 config EXPORTFS
-	bool
+	tristate
 
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EXPERT
@@ -124,6 +124,7 @@ config TMPFS
 config TMPFS_POSIX_ACL
 	bool "Tmpfs POSIX Access Control Lists"
 	depends on TMPFS
+	select TMPFS_XATTR
 	select GENERIC_ACL
 	help
 	  POSIX Access Control Lists (ACLs) support permissions for users and
@@ -134,6 +135,22 @@ config TMPFS_POSIX_ACL
 
 	  If you don't know what Access Control Lists are, say N.
 
+config TMPFS_XATTR
+	bool "Tmpfs extended attributes"
+	depends on TMPFS
+	default n
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  Currently this enables support for the trusted.* and
+	  security.* namespaces.
+
+	  You need this for POSIX ACL support on tmpfs.
+
+	  If unsure, say N.
+
 config HUGETLBFS
 	bool "HugeTLB file system support"
 	depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index e3e9efc1fdd8..03330e2e390c 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -320,6 +320,8 @@ affs_rmdir(struct inode *dir, struct dentry *dentry)
 		 dentry->d_inode->i_ino,
 		 (int)dentry->d_name.len, dentry->d_name.name);
 
+	dentry_unhash(dentry);
+
 	return affs_remove_header(dentry);
 }
 
@@ -417,6 +419,9 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct buffer_head *bh = NULL;
 	int retval;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
 		 (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
 		 (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 20c106f24927..2c4e05160042 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -845,6 +845,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 	_enter("{%x:%u},{%s}",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
 
+	dentry_unhash(dentry);
+
 	ret = -ENAMETOOLONG;
 	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
@@ -1146,6 +1148,9 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct key *key;
 	int ret;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	vnode = AFS_FS_I(old_dentry->d_inode);
 	orig_dvnode = AFS_FS_I(old_dir);
 	new_dvnode = AFS_FS_I(new_dir);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index f55ae23b137e..87d95a8cddbc 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -583,6 +583,8 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
+	dentry_unhash(dentry);
+
 	if (atomic_dec_and_test(&ino->count)) {
 		p_ino = autofs4_dentry_ino(dentry->d_parent);
 		if (p_ino && dentry->d_parent != dentry)
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index b14cebfd9047..c7d1d06b0483 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -224,6 +224,9 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct bfs_sb_info *info;
 	int error = -ENOENT;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	old_bh = new_bh = NULL;
 	old_inode = old_dentry->d_inode;
 	if (S_ISDIR(old_inode->i_mode))
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 397d3057d336..1bffbe0ed778 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -820,6 +820,8 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
 	int res;
 	char buf[16];
 
+	memset(&bprm, 0, sizeof(bprm));
+
 	/* Create the file name */
 	sprintf(buf, "/lib/lib%d.so", id);
 
@@ -835,6 +837,12 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
 	if (!bprm.cred)
 		goto out;
 
+	/* We don't really care about recalculating credentials at this point
+	 * as we're past the point of no return and are dealing with shared
+	 * libraries.
+	 */
+	bprm.cred_prepared = 1;
+
 	res = prepare_binprm(&bprm);
 
 	if (!IS_ERR_VALUE(res))
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 257b00e98428..1f2b19978333 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1120,6 +1120,15 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 					goto restart;
 				}
 			}
+
+			if (!ret && !bdev->bd_openers) {
+				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+				bdi = blk_get_backing_dev_info(bdev);
+				if (bdi == NULL)
+					bdi = &default_backing_dev_info;
+				bdev_inode_switch_bdi(bdev->bd_inode, bdi);
+			}
+
 			/*
 			 * If the device is invalidated, rescan partition
 			 * if open succeeded or failed with -ENOMEDIUM.
@@ -1130,14 +1139,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				rescan_partitions(disk, bdev);
 			if (ret)
 				goto out_clear;
-
-			if (!bdev->bd_openers) {
-				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
-				bdi = blk_get_backing_dev_info(bdev);
-				if (bdi == NULL)
-					bdi = &default_backing_dev_info;
-				bdev_inode_switch_bdi(bdev->bd_inode, bdi);
-			}
 		} else {
 			struct block_device *whole;
 			whole = bdget_disk(disk, 0);
@@ -1237,6 +1238,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 	res = __blkdev_get(bdev, mode, 0);
 
 	if (whole) {
+		struct gendisk *disk = whole->bd_disk;
+
 		/* finish claiming */
 		mutex_lock(&bdev->bd_mutex);
 		spin_lock(&bdev_lock);
@@ -1263,15 +1266,16 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 		spin_unlock(&bdev_lock);
 
 		/*
-		 * Block event polling for write claims.  Any write
-		 * holder makes the write_holder state stick until all
-		 * are released.  This is good enough and tracking
-		 * individual writeable reference is too fragile given
-		 * the way @mode is used in blkdev_get/put().
+		 * Block event polling for write claims if requested.  Any
+		 * write holder makes the write_holder state stick until
+		 * all are released.  This is good enough and tracking
+		 * individual writeable reference is too fragile given the
+		 * way @mode is used in blkdev_get/put().
 		 */
-		if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+		if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) &&
+		    !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
 			bdev->bd_write_holder = true;
-			disk_block_events(bdev->bd_disk);
+			disk_block_events(disk);
 		}
 
 		mutex_unlock(&bdev->bd_mutex);
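The blkdev_get() hunk above makes event blocking on exclusive write opens opt-in via GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE. A hedged sketch of how a removable-media driver might opt in before registering its disk; the function name is illustrative and not taken from this series:

#include <linux/genhd.h>

/* Illustrative only: a driver that wants media-change polling
 * suppressed while a write-exclusive open is held (e.g. while
 * burning media) sets the new flag before add_disk().  Queue and
 * fops setup is elided.
 */
static void example_register_disk(struct gendisk *disk)
{
	disk->flags |= GENHD_FL_REMOVABLE |
		       GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
	add_disk(disk);
}

Disks that do not set the flag keep getting media events polled even while a write holder exists, which is the safer default for non-optical devices.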
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0e0fe0f6ec75..c5d9fbb92bc3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -10,6 +10,8 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
+#include <linux/prefetch.h>
+#include <linux/cleancache.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "compat.h"
@@ -1969,6 +1971,13 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 
 	set_page_extent_mapped(page);
 
+	if (!PageUptodate(page)) {
+		if (cleancache_get_page(page) == 0) {
+			BUG_ON(blocksize != PAGE_SIZE);
+			goto out;
+		}
+	}
+
 	end = page_end;
 	while (1) {
 		lock_extent(tree, start, end, GFP_NOFS);
@@ -2102,6 +2111,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 		cur = cur + iosize;
 		pg_offset += iosize;
 	}
+out:
 	if (!nr) {
 		if (!PageError(page))
 			SetPageUptodate(page);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index fa2c5d87f219..ca38eca70af0 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -710,7 +710,7 @@ again:
 		WARN_ON(cur->checked);
 		if (!list_empty(&cur->upper)) {
 			/*
-			 * the backref was added previously when processsing
+			 * the backref was added previously when processing
 			 * backref of type BTRFS_TREE_BLOCK_REF_KEY
 			 */
 			BUG_ON(!list_is_singular(&cur->upper));
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 28e3cb2607ff..9b2e7e5bc3ef 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -39,6 +39,7 @@
 #include <linux/miscdevice.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
+#include <linux/cleancache.h>
 #include "compat.h"
 #include "delayed-inode.h"
 #include "ctree.h"
@@ -632,6 +633,7 @@ static int btrfs_fill_super(struct super_block *sb,
 	sb->s_root = root_dentry;
 
 	save_mount_options(sb, data);
+	cleancache_init_fs(sb);
 	return 0;
 
 fail_close:
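The cleancache hooks added in the btrfs hunks above and in the fs/buffer.c hunk below follow one protocol: declare a pool at mount, ask cleancache before issuing real read I/O, and flush stale copies on invalidation. A hedged sketch of that lifecycle under the cleancache API of this era; the examplefs names and its get_block callback are hypothetical:

#include <linux/cleancache.h>
#include <linux/mpage.h>

/* Mount time: register this filesystem instance with cleancache. */
static void examplefs_init_cleancache(struct super_block *sb)
{
	cleancache_init_fs(sb);
}

static int examplefs_readpage(struct file *file, struct page *page)
{
	/* Before real I/O: a return of 0 means cleancache filled the
	 * page from its (transcendent) copy and the block read can be
	 * skipped entirely.
	 */
	if (cleancache_get_page(page) == 0) {
		SetPageUptodate(page);
		unlock_page(page);
		return 0;
	}
	/* Otherwise fall back to a normal block read. */
	return mpage_readpage(page, examplefs_get_block);
}

The invalidation side is the cleancache_flush_inode(mapping) call that invalidate_bdev() gains below.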
diff --git a/fs/buffer.c b/fs/buffer.c
index a08bb8e61c6f..698c6b2cc462 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,6 +41,7 @@
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
+#include <linux/cleancache.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
@@ -269,6 +270,10 @@ void invalidate_bdev(struct block_device *bdev)
 	invalidate_bh_lrus();
 	lru_add_drain_all();	/* make sure all lru add caches are flushed */
 	invalidate_mapping_pages(mapping, 0, -1);
+	/* 99% of the time, we don't need to flush the cleancache on the bdev.
+	 * But, for the strange corners, lets be cautious
+	 */
+	cleancache_flush_inode(mapping);
 }
 EXPORT_SYMBOL(invalidate_bdev);
 
@@ -2331,24 +2336,26 @@ EXPORT_SYMBOL(block_commit_write);
  * page lock we can determine safely if the page is beyond EOF. If it is not
  * beyond EOF, then the page is guaranteed safe against truncation until we
  * unlock the page.
+ *
+ * Direct callers of this function should call vfs_check_frozen() so that page
+ * fault does not busyloop until the fs is thawed.
  */
-int
-block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-		   get_block_t get_block)
+int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+			 get_block_t get_block)
 {
 	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	unsigned long end;
 	loff_t size;
-	int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
+	int ret;
 
 	lock_page(page);
 	size = i_size_read(inode);
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_offset(page) > size)) {
-		/* page got truncated out from underneath us */
-		unlock_page(page);
-		goto out;
+		/* We overload EFAULT to mean page got truncated */
+		ret = -EFAULT;
+		goto out_unlock;
 	}
 
 	/* page is wholly or partially inside EOF */
@@ -2361,18 +2368,41 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (!ret)
 		ret = block_commit_write(page, 0, end);
 
-	if (unlikely(ret)) {
-		unlock_page(page);
-		if (ret == -ENOMEM)
-			ret = VM_FAULT_OOM;
-		else /* -ENOSPC, -EIO, etc */
-			ret = VM_FAULT_SIGBUS;
-	} else
-		ret = VM_FAULT_LOCKED;
-
-out:
+	if (unlikely(ret < 0))
+		goto out_unlock;
+	/*
+	 * Freezing in progress? We check after the page is marked dirty and
+	 * with page lock held so if the test here fails, we are sure freezing
+	 * code will wait during syncing until the page fault is done - at that
+	 * point page will be dirty and unlocked so freezing code will write it
+	 * and writeprotect it again.
+	 */
+	set_page_dirty(page);
+	if (inode->i_sb->s_frozen != SB_UNFROZEN) {
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+	return 0;
+out_unlock:
+	unlock_page(page);
 	return ret;
 }
+EXPORT_SYMBOL(__block_page_mkwrite);
+
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+		   get_block_t get_block)
+{
+	int ret;
+	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
+
+	/*
+	 * This check is racy but catches the common case. The check in
+	 * __block_page_mkwrite() is reliable.
+	 */
+	vfs_check_frozen(sb, SB_FREEZE_WRITE);
+	ret = __block_page_mkwrite(vma, vmf, get_block);
+	return block_page_mkwrite_return(ret);
+}
 EXPORT_SYMBOL(block_page_mkwrite);
 
 /*
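With the split above, a filesystem that needs no journaling work in its fault path can keep using block_page_mkwrite(), which performs the racy freeze check and converts the internal error codes via block_page_mkwrite_return(); only filesystems calling __block_page_mkwrite() directly must do vfs_check_frozen() themselves. A hedged sketch of the simple case; the examplefs names and its get_block callback are hypothetical:

static int examplefs_page_mkwrite(struct vm_area_struct *vma,
				  struct vm_fault *vmf)
{
	/* block_page_mkwrite() calls vfs_check_frozen() for us and maps
	 * -EFAULT/-EAGAIN/-ENOMEM etc. to the appropriate VM_FAULT_*
	 * codes via block_page_mkwrite_return().
	 */
	return block_page_mkwrite(vma, vmf, examplefs_get_block);
}

static const struct vm_operations_struct examplefs_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= examplefs_page_mkwrite,
};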
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 38b8ab554924..33da49dc3cc6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -848,7 +848,8 @@ get_more_pages:
 			op->payload_len = cpu_to_le32(len);
 			req->r_request->hdr.data_len = cpu_to_le32(len);
 
-			ceph_osdc_start_request(&fsc->client->osdc, req, true);
+			rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
+			BUG_ON(rc);
 			req = NULL;
 
 			/* continue? */
@@ -880,8 +881,6 @@ release_pvec_pages:
 out:
 	if (req)
 		ceph_osdc_put_request(req);
-	if (rc > 0)
-		rc = 0;  /* vfs expects us to return 0 */
 	ceph_put_snap_context(snapc);
 	dout("writepages done, rc = %d\n", rc);
 	return rc;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2a5404c1c42f..1f72b00447c4 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -569,7 +569,8 @@ retry:
 		list_add_tail(&cap->session_caps, &session->s_caps);
 		session->s_nr_caps++;
 		spin_unlock(&session->s_cap_lock);
-	}
+	} else if (new_cap)
+		ceph_put_cap(mdsc, new_cap);
 
 	if (!ci->i_snap_realm) {
 		/*
@@ -2634,6 +2635,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 			      struct ceph_mds_session *session,
 			      int *open_target_sessions)
 {
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int mds = session->s_mds;
 	unsigned mseq = le32_to_cpu(ex->migrate_seq);
@@ -2670,6 +2672,19 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 			 * export targets, so that we get the matching IMPORT
 			 */
 			*open_target_sessions = 1;
+
+			/*
+			 * we can't flush dirty caps that we've seen the
+			 * EXPORT but no IMPORT for
+			 */
+			spin_lock(&mdsc->cap_dirty_lock);
+			if (!list_empty(&ci->i_dirty_item)) {
+				dout(" moving %p to cap_dirty_migrating\n",
+				     inode);
+				list_move(&ci->i_dirty_item,
+					  &mdsc->cap_dirty_migrating);
+			}
+			spin_unlock(&mdsc->cap_dirty_lock);
 		}
 		__ceph_remove_cap(cap);
 	}
@@ -2707,6 +2722,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
 		ci->i_cap_exporting_issued = 0;
 		ci->i_cap_exporting_mseq = 0;
 		ci->i_cap_exporting_mds = -1;
+
+		spin_lock(&mdsc->cap_dirty_lock);
+		if (!list_empty(&ci->i_dirty_item)) {
+			dout(" moving %p back to cap_dirty\n", inode);
+			list_move(&ci->i_dirty_item, &mdsc->cap_dirty);
+		}
+		spin_unlock(&mdsc->cap_dirty_lock);
 	} else {
 		dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
 		     inode, ci, mds, mseq);
@@ -2910,38 +2932,16 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
  */
 void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
 {
-	struct ceph_inode_info *ci, *nci = NULL;
-	struct inode *inode, *ninode = NULL;
-	struct list_head *p, *n;
+	struct ceph_inode_info *ci;
+	struct inode *inode;
 
 	dout("flush_dirty_caps\n");
 	spin_lock(&mdsc->cap_dirty_lock);
-	list_for_each_safe(p, n, &mdsc->cap_dirty) {
-		if (nci) {
-			ci = nci;
-			inode = ninode;
-			ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
-			dout("flush_dirty_caps inode %p (was next inode)\n",
-			     inode);
-		} else {
-			ci = list_entry(p, struct ceph_inode_info,
-					i_dirty_item);
-			inode = igrab(&ci->vfs_inode);
-			BUG_ON(!inode);
-			dout("flush_dirty_caps inode %p\n", inode);
-		}
-		if (n != &mdsc->cap_dirty) {
-			nci = list_entry(n, struct ceph_inode_info,
-					 i_dirty_item);
-			ninode = igrab(&nci->vfs_inode);
-			BUG_ON(!ninode);
-			nci->i_ceph_flags |= CEPH_I_NOFLUSH;
-			dout("flush_dirty_caps next inode %p, noflush\n",
-			     ninode);
-		} else {
-			nci = NULL;
-			ninode = NULL;
-		}
+	while (!list_empty(&mdsc->cap_dirty)) {
+		ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
+				      i_dirty_item);
+		inode = igrab(&ci->vfs_inode);
+		dout("flush_dirty_caps %p\n", inode);
 		spin_unlock(&mdsc->cap_dirty_lock);
 		if (inode) {
 			ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
@@ -2951,6 +2951,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
 		spin_lock(&mdsc->cap_dirty_lock);
 	}
 	spin_unlock(&mdsc->cap_dirty_lock);
+	dout("flush_dirty_caps done\n");
 }
 
 /*
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 1a867a3601ae..33729e822bb9 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -360,7 +360,7 @@ more:
 		rinfo = &fi->last_readdir->r_reply_info;
 		dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
 		     rinfo->dir_nr, off, fi->offset);
-	while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
+	while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
 		u64 pos = ceph_make_fpos(frag, off);
 		struct ceph_mds_reply_inode *in =
 			rinfo->dir_in[off - fi->offset].in;
@@ -1066,16 +1066,17 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int left;
+	const int bufsize = 1024;
 
 	if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
 		return -EISDIR;
 
 	if (!cf->dir_info) {
-		cf->dir_info = kmalloc(1024, GFP_NOFS);
+		cf->dir_info = kmalloc(bufsize, GFP_NOFS);
 		if (!cf->dir_info)
 			return -ENOMEM;
 		cf->dir_info_len =
-			sprintf(cf->dir_info,
+			snprintf(cf->dir_info, bufsize,
 				"entries:   %20lld\n"
 				" files:    %20lld\n"
 				" subdirs:  %20lld\n"
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e41056174bf8..a610d3d67488 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -86,6 +86,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
 static struct dentry *__fh_to_dentry(struct super_block *sb,
 				     struct ceph_nfs_fh *fh)
 {
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
 	struct inode *inode;
 	struct dentry *dentry;
 	struct ceph_vino vino;
@@ -95,8 +96,24 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
 	vino.ino = fh->ino;
 	vino.snap = CEPH_NOSNAP;
 	inode = ceph_find_inode(sb, vino);
-	if (!inode)
-		return ERR_PTR(-ESTALE);
+	if (!inode) {
+		struct ceph_mds_request *req;
+
+		req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
+					       USE_ANY_MDS);
+		if (IS_ERR(req))
+			return ERR_CAST(req);
+
+		req->r_ino1 = vino;
+		req->r_num_caps = 1;
+		err = ceph_mdsc_do_request(mdsc, NULL, req);
+		inode = req->r_target_inode;
+		if (inode)
+			igrab(inode);
+		ceph_mdsc_put_request(req);
+		if (!inode)
+			return ERR_PTR(-ESTALE);
+	}
 
 	dentry = d_obtain_alias(inode);
 	if (IS_ERR(dentry)) {
@@ -148,8 +165,10 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
 	snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
 	req->r_num_caps = 1;
 	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	inode = req->r_target_inode;
+	if (inode)
+		igrab(inode);
 	ceph_mdsc_put_request(req);
-	inode = ceph_find_inode(sb, vino);
 	if (!inode)
 		return ERR_PTR(err ? err : -ESTALE);
 	}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d0fae4ce9ba5..79743d146be6 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -578,6 +578,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
 	if (dir) {
 		struct ceph_inode_info *ci = ceph_inode(dir);
 
+		ihold(dir);
 		spin_lock(&ci->i_unsafe_lock);
 		req->r_unsafe_dir = dir;
 		list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
@@ -598,6 +599,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 		spin_lock(&ci->i_unsafe_lock);
 		list_del_init(&req->r_unsafe_dir_item);
 		spin_unlock(&ci->i_unsafe_lock);
+
+		iput(req->r_unsafe_dir);
+		req->r_unsafe_dir = NULL;
 	}
 
 	ceph_mdsc_put_request(req);
@@ -2691,7 +2695,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 {
 	struct super_block *sb = mdsc->fsc->sb;
 	struct inode *inode;
-	struct ceph_inode_info *ci;
 	struct dentry *parent, *dentry;
 	struct ceph_dentry_info *di;
 	int mds = session->s_mds;
@@ -2728,7 +2731,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 		dout("handle_lease no inode %llx\n", vino.ino);
 		goto release;
 	}
-	ci = ceph_inode(inode);
 
 	/* dentry */
 	parent = d_find_alias(inode);
@@ -3002,6 +3004,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	spin_lock_init(&mdsc->snap_flush_lock);
 	mdsc->cap_flush_seq = 0;
 	INIT_LIST_HEAD(&mdsc->cap_dirty);
+	INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
 	mdsc->num_cap_flushing = 0;
 	spin_lock_init(&mdsc->cap_dirty_lock);
 	init_waitqueue_head(&mdsc->cap_flushing_wq);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 4e3a9cc0bba6..7d8a0d662d56 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -278,6 +278,7 @@ struct ceph_mds_client {
 
 	u64               cap_flush_seq;
 	struct list_head  cap_dirty;        /* inodes with dirty caps */
+	struct list_head  cap_dirty_migrating; /* ...that are migration... */
 	int               num_cap_flushing; /* # caps we are flushing */
 	spinlock_t        cap_dirty_lock;   /* protects above items */
 	wait_queue_head_t cap_flushing_wq;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 7cb0f7f847e4..1cd4c3a1862d 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -7,6 +7,7 @@ config CIFS
 	select CRYPTO_MD5
 	select CRYPTO_HMAC
 	select CRYPTO_ARC4
+	select CRYPTO_DES
 	help
 	  This is the client VFS module for the Common Internet File System
 	  (CIFS) protocol which is the successor to the Server Message Block
@@ -152,16 +153,8 @@ config CIFS_ACL
 	  Allows to fetch CIFS/NTFS ACL from the server. The DACL blob
 	  is handed over to the application/caller.
 
-config CIFS_EXPERIMENTAL
-	bool "CIFS Experimental Features (EXPERIMENTAL)"
+config CIFS_NFSD_EXPORT
+	bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)"
 	depends on CIFS && EXPERIMENTAL
 	help
-	  Enables cifs features under testing. These features are
-	  experimental and currently include DFS support and directory
-	  change notification ie fcntl(F_DNOTIFY), as well as the upcall
-	  mechanism which will be used for Kerberos session negotiation
-	  and uid remapping.  Some of these features also may depend on
-	  setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental
-	  (which is disabled by default). See the file fs/cifs/README
-	  for more details.  If unsure, say N.
-
+	  Allows NFS server to export a CIFS mounted share (nfsd over cifs)
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index d87558448e3d..005d524c3a4a 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -4,7 +4,7 @@
 obj-$(CONFIG_CIFS) += cifs.o
 
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
-	  link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
+	  link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
 	  cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
 	  readdir.o ioctl.o sess.o export.o
 
diff --git a/fs/cifs/README b/fs/cifs/README
index 74ab165fc646..c5c2c5e5f0f2 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -457,6 +457,9 @@ A partial list of the supported mount options follows:
 		otherwise - read from the server. All written data are stored
 		in the cache, but if the client doesn't have Exclusive Oplock,
 		it writes the data to the server.
+  rwpidforward  Forward pid of a process who opened a file to any read or write
+		operation on that file. This prevent applications like WINE
+		from failing on read and write if we use mandatory brlock style.
   acl		Allow setfacl and getfacl to manage posix ACLs if server
 		supports them.  (default)
   noacl		Do not allow setfacl and getfacl calls on this mount
@@ -704,18 +707,6 @@ the start of smb requests and responses can be enabled via:
 
 	echo 1 > /proc/fs/cifs/traceSMB
 
-Two other experimental features are under development. To test these
-requires enabling CONFIG_CIFS_EXPERIMENTAL
-
-	cifsacl support needed to retrieve approximated mode bits based on
-	the contents on the CIFS ACL.
-
-	lease support: cifs will check the oplock state before calling into
-	the vfs to see if we can grant a lease on a file.
-
-	DNOTIFY fcntl: needed for support of directory change
-		notification and perhaps later for file leases)
-
 Per share (per client mount) statistics are available in /proc/fs/cifs/Stats
 if the kernel was configured with cifs statistics enabled.  The statistics
 represent the number of successful (ie non-zero return code from the server)
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 53d57a3fe427..dd8584d35a14 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -146,7 +146,7 @@ static char *extract_sharename(const char *treename)
 static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer,
 				   uint16_t maxbuf)
 {
-	const struct cifsTconInfo *tcon = cookie_netfs_data;
+	const struct cifs_tcon *tcon = cookie_netfs_data;
 	char *sharename;
 	uint16_t len;
 
@@ -173,7 +173,7 @@ cifs_fscache_super_get_aux(const void *cookie_netfs_data, void *buffer,
 			   uint16_t maxbuf)
 {
 	struct cifs_fscache_super_auxdata auxdata;
-	const struct cifsTconInfo *tcon = cookie_netfs_data;
+	const struct cifs_tcon *tcon = cookie_netfs_data;
 
 	memset(&auxdata, 0, sizeof(auxdata));
 	auxdata.resource_id = tcon->resource_id;
@@ -192,7 +192,7 @@ fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data,
 					     uint16_t datalen)
 {
 	struct cifs_fscache_super_auxdata auxdata;
-	const struct cifsTconInfo *tcon = cookie_netfs_data;
+	const struct cifs_tcon *tcon = cookie_netfs_data;
 
 	if (datalen != sizeof(auxdata))
 		return FSCACHE_CHECKAUX_OBSOLETE;
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 30d01bc90855..2fe3cf13b2e9 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -63,7 +63,7 @@ void cifs_dump_detail(struct smb_hdr *smb)
 	cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
 		  smb->Command, smb->Status.CifsError,
 		  smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
-	cERROR(1, "smb buf %p len %d", smb, smbCalcSize_LE(smb));
+	cERROR(1, "smb buf %p len %d", smb, smbCalcSize(smb));
 }
 
 
@@ -110,8 +110,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 	struct list_head *tmp1, *tmp2, *tmp3;
 	struct mid_q_entry *mid_entry;
 	struct TCP_Server_Info *server;
-	struct cifsSesInfo *ses;
-	struct cifsTconInfo *tcon;
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon;
 	int i, j;
 	__u32 dev_type;
 
@@ -152,7 +152,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 				    tcp_ses_list);
 		i++;
 		list_for_each(tmp2, &server->smb_ses_list) {
-			ses = list_entry(tmp2, struct cifsSesInfo,
+			ses = list_entry(tmp2, struct cifs_ses,
 					 smb_ses_list);
 			if ((ses->serverDomain == NULL) ||
 				(ses->serverOS == NULL) ||
@@ -171,7 +171,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 			seq_printf(m, "TCP status: %d\n\tLocal Users To "
 				   "Server: %d SecMode: 0x%x Req On Wire: %d",
 				   server->tcpStatus, server->srv_count,
-				   server->secMode,
+				   server->sec_mode,
 				   atomic_read(&server->inFlight));
 
 #ifdef CONFIG_CIFS_STATS2
@@ -183,7 +183,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 			seq_puts(m, "\n\tShares:");
 			j = 0;
 			list_for_each(tmp3, &ses->tcon_list) {
-				tcon = list_entry(tmp3, struct cifsTconInfo,
+				tcon = list_entry(tmp3, struct cifs_tcon,
 						  tcon_list);
 				++j;
 				dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
@@ -256,8 +256,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 	int rc;
 	struct list_head *tmp1, *tmp2, *tmp3;
 	struct TCP_Server_Info *server;
-	struct cifsSesInfo *ses;
-	struct cifsTconInfo *tcon;
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon;
 
 	rc = get_user(c, buffer);
 	if (rc)
@@ -273,11 +273,11 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 			server = list_entry(tmp1, struct TCP_Server_Info,
 					    tcp_ses_list);
 			list_for_each(tmp2, &server->smb_ses_list) {
-				ses = list_entry(tmp2, struct cifsSesInfo,
+				ses = list_entry(tmp2, struct cifs_ses,
 						 smb_ses_list);
 				list_for_each(tmp3, &ses->tcon_list) {
 					tcon = list_entry(tmp3,
-							  struct cifsTconInfo,
+							  struct cifs_tcon,
 							  tcon_list);
 					atomic_set(&tcon->num_smbs_sent, 0);
 					atomic_set(&tcon->num_writes, 0);
@@ -312,8 +312,8 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
 	int i;
 	struct list_head *tmp1, *tmp2, *tmp3;
 	struct TCP_Server_Info *server;
-	struct cifsSesInfo *ses;
-	struct cifsTconInfo *tcon;
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon;
 
 	seq_printf(m,
 			"Resources in use\nCIFS Session: %d\n",
@@ -346,11 +346,11 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
 		server = list_entry(tmp1, struct TCP_Server_Info,
 				    tcp_ses_list);
 		list_for_each(tmp2, &server->smb_ses_list) {
-			ses = list_entry(tmp2, struct cifsSesInfo,
+			ses = list_entry(tmp2, struct cifs_ses,
 					 smb_ses_list);
 			list_for_each(tmp3, &ses->tcon_list) {
 				tcon = list_entry(tmp3,
-						  struct cifsTconInfo,
+						  struct cifs_tcon,
 						  tcon_list);
 				i++;
 				seq_printf(m, "\n%d) %s", i, tcon->treeName);
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 2b68ac57d97d..8d8f28c94c0f 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -272,7 +272,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 	struct dfs_info3_param *referrals = NULL;
 	unsigned int num_referrals = 0;
 	struct cifs_sb_info *cifs_sb;
-	struct cifsSesInfo *ses;
+	struct cifs_ses *ses;
 	char *full_path;
 	int xid, i;
 	int rc;
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index ac51cd2d33ae..ffb1459dc6ec 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -41,6 +41,7 @@
 #define CIFS_MOUNT_MF_SYMLINKS	0x10000 /* Minshall+French Symlinks enabled */
 #define CIFS_MOUNT_MULTIUSER	0x20000 /* multiuser mount */
 #define CIFS_MOUNT_STRICT_IO	0x40000 /* strict cache mode */
+#define CIFS_MOUNT_RWPIDFORWARD	0x80000 /* use pid forwarding for rw */
 
 struct cifs_sb_info {
 	struct rb_root tlink_tree;
@@ -56,11 +57,7 @@ struct cifs_sb_info {
 	mode_t	mnt_file_mode;
 	mode_t	mnt_dir_mode;
 	unsigned int mnt_cifs_flags;
-	int	prepathlen;
-	char   *prepath; /* relative path under the share to mount to */
-#ifdef CONFIG_CIFS_DFS_UPCALL
-	char   *mountdata; /* mount options received at mount time */
-#endif
+	char   *mountdata; /* options received at mount time or via DFS refs */
 	struct backing_dev_info bdi;
 	struct delayed_work prune_tlinks;
 };
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 33d221394aca..2272fd5fe5b7 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -95,7 +95,7 @@ struct key_type cifs_spnego_key_type = {
 
 /* get a key struct with a SPNEGO security blob, suitable for session setup */
 struct key *
-cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
+cifs_get_spnego_key(struct cifs_ses *sesInfo)
 {
 	struct TCP_Server_Info *server = sesInfo->server;
 	struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h
index e4041ec4d712..31bef9ee078b 100644
--- a/fs/cifs/cifs_spnego.h
+++ b/fs/cifs/cifs_spnego.h
@@ -41,7 +41,7 @@ struct cifs_spnego_msg {
41 41
42#ifdef __KERNEL__ 42#ifdef __KERNEL__
43extern struct key_type cifs_spnego_key_type; 43extern struct key_type cifs_spnego_key_type;
44extern struct key *cifs_get_spnego_key(struct cifsSesInfo *sesInfo); 44extern struct key *cifs_get_spnego_key(struct cifs_ses *sesInfo);
45#endif /* KERNEL */ 45#endif /* KERNEL */
46 46
47#endif /* _CIFS_SPNEGO_H */ 47#endif /* _CIFS_SPNEGO_H */
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 644dd882a560..6d02fd560566 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -82,6 +82,9 @@ int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
82char *cifs_strndup_from_ucs(const char *src, const int maxlen, 82char *cifs_strndup_from_ucs(const char *src, const int maxlen,
83 const bool is_unicode, 83 const bool is_unicode,
84 const struct nls_table *codepage); 84 const struct nls_table *codepage);
85extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
86 const struct nls_table *cp, int mapChars);
87
85#endif 88#endif
86 89
87/* 90/*
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index beeebf194234..5f02b4ee9a03 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -23,77 +23,404 @@
23 23
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/string.h>
27#include <linux/keyctl.h>
28#include <linux/key-type.h>
29#include <keys/user-type.h>
26#include "cifspdu.h" 30#include "cifspdu.h"
27#include "cifsglob.h" 31#include "cifsglob.h"
28#include "cifsacl.h" 32#include "cifsacl.h"
29#include "cifsproto.h" 33#include "cifsproto.h"
30#include "cifs_debug.h" 34#include "cifs_debug.h"
31 35
32
33static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
34 {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
35 {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
36 {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"},
37 {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
38 {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(544), 0, 0, 0} }, "root"},
39 {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(545), 0, 0, 0} }, "users"},
40 {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(546), 0, 0, 0} }, "guest"} }
41;
42
43
44/* security id for everyone/world system group */ 36/* security id for everyone/world system group */
45static const struct cifs_sid sid_everyone = { 37static const struct cifs_sid sid_everyone = {
46 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; 38 1, 1, {0, 0, 0, 0, 0, 1}, {0} };
47/* security id for Authenticated Users system group */ 39/* security id for Authenticated Users system group */
48static const struct cifs_sid sid_authusers = { 40static const struct cifs_sid sid_authusers = {
49 1, 1, {0, 0, 0, 0, 0, 5}, {11} }; 41 1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} };
50/* group users */ 42/* group users */
51static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; 43static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
52 44
45const struct cred *root_cred;
53 46
54int match_sid(struct cifs_sid *ctsid) 47static void
48shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem,
49 int *nr_del)
55{ 50{
56 int i, j; 51 struct rb_node *node;
57 int num_subauth, num_sat, num_saw; 52 struct rb_node *tmp;
58 struct cifs_sid *cwsid; 53 struct cifs_sid_id *psidid;
54
55 node = rb_first(root);
56 while (node) {
57 tmp = node;
58 node = rb_next(tmp);
59 psidid = rb_entry(tmp, struct cifs_sid_id, rbnode);
60 if (nr_to_scan == 0 || *nr_del == nr_to_scan)
61 ++(*nr_rem);
62 else {
63 if (time_after(jiffies, psidid->time + SID_MAP_EXPIRE)
64 && psidid->refcount == 0) {
65 rb_erase(tmp, root);
66 ++(*nr_del);
67 } else
68 ++(*nr_rem);
69 }
70 }
71}
72
73/*
74 * Run idmap cache shrinker.
75 */
76static int
77cifs_idmap_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
78{
79 int nr_del = 0;
80 int nr_rem = 0;
81 struct rb_root *root;
82
83 root = &uidtree;
84 spin_lock(&siduidlock);
85 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
86 spin_unlock(&siduidlock);
87
88 root = &gidtree;
89 spin_lock(&sidgidlock);
90 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
91 spin_unlock(&sidgidlock);
92
93 return nr_rem;
94}
95
96static struct shrinker cifs_shrinker = {
97 .shrink = cifs_idmap_shrinker,
98 .seeks = DEFAULT_SEEKS,
99};
100
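
cifs_idmap_shrinker() is written against the shrinker interface of this kernel generation, in which a single .shrink callback both counts and reclaims. A rough restatement of that contract for readers of the function above (my paraphrase of the mm convention, not quoted from mm code):

    /*
     * Pre-shrink_control shrinker contract, roughly:
     *  - nr_to_scan == 0: free nothing; just return an estimate of how
     *    many cache entries could be reclaimed.
     *  - nr_to_scan  > 0: free up to nr_to_scan entries, then return
     *    how many remain.
     * shrink_idmap_tree() handles both cases in one pass: expired nodes
     * with refcount 0 are erased and counted in *nr_del; everything
     * else is counted in *nr_rem, which becomes the return value.
     */
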
101static int
102cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen)
103{
104 char *payload;
105
106 payload = kmalloc(datalen, GFP_KERNEL);
107 if (!payload)
108 return -ENOMEM;
109
110 memcpy(payload, data, datalen);
111 key->payload.data = payload;
112 return 0;
113}
114
115static inline void
116cifs_idmap_key_destroy(struct key *key)
117{
118 kfree(key->payload.data);
119}
59 120
60 if (!ctsid) 121struct key_type cifs_idmap_key_type = {
61 return -1; 122 .name = "cifs.idmap",
123 .instantiate = cifs_idmap_key_instantiate,
124 .destroy = cifs_idmap_key_destroy,
125 .describe = user_describe,
126 .match = user_match,
127};
128
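
Keys of this new "cifs.idmap" type are instantiated from userspace via request_key(). With the cifs.idmap helper shipped in cifs-utils, the usual wiring is a single /etc/request-key.conf line along these lines (the helper path varies by distribution; treat this as an illustrative example):

    #OP     TYPE        DESCRIPTION  CALLOUT-INFO  PROGRAM
    create  cifs.idmap  *            *             /usr/sbin/cifs.idmap %k
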
129static void
130sid_to_str(struct cifs_sid *sidptr, char *sidstr)
131{
132 int i;
133 unsigned long saval;
134 char *strptr;
62 135
63 for (i = 0; i < NUM_WK_SIDS; ++i) { 136 strptr = sidstr;
64 cwsid = &(wksidarr[i].cifssid);
65 137
66 /* compare the revision */ 138 sprintf(strptr, "%s", "S");
67 if (ctsid->revision != cwsid->revision) 139 strptr = sidstr + strlen(sidstr);
68 continue;
69 140
70 /* compare all of the six auth values */ 141 sprintf(strptr, "-%d", sidptr->revision);
71 for (j = 0; j < 6; ++j) { 142 strptr = sidstr + strlen(sidstr);
72 if (ctsid->authority[j] != cwsid->authority[j]) 143
73 break; 144 for (i = 0; i < 6; ++i) {
145 if (sidptr->authority[i]) {
146 sprintf(strptr, "-%d", sidptr->authority[i]);
147 strptr = sidstr + strlen(sidstr);
74 } 148 }
75 if (j < 6) 149 }
76 continue; /* all of the auth values did not match */ 150
77 151 for (i = 0; i < sidptr->num_subauth; ++i) {
78 /* compare all of the subauth values if any */ 152 saval = le32_to_cpu(sidptr->sub_auth[i]);
79 num_sat = ctsid->num_subauth; 153 sprintf(strptr, "-%ld", saval);
80 num_saw = cwsid->num_subauth; 154 strptr = sidstr + strlen(sidstr);
81 num_subauth = num_sat < num_saw ? num_sat : num_saw; 155 }
82 if (num_subauth) { 156}
83 for (j = 0; j < num_subauth; ++j) { 157
84 if (ctsid->sub_auth[j] != cwsid->sub_auth[j]) 158static void
85 break; 159id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr,
86 } 160 struct cifs_sid_id **psidid, char *typestr)
87 if (j < num_subauth) 161{
88 continue; /* all sub_auth values do not match */ 162 int rc;
163 char *strptr;
164 struct rb_node *node = root->rb_node;
165 struct rb_node *parent = NULL;
166 struct rb_node **linkto = &(root->rb_node);
167 struct cifs_sid_id *lsidid;
168
169 while (node) {
170 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
171 parent = node;
172 rc = compare_sids(sidptr, &((lsidid)->sid));
173 if (rc > 0) {
174 linkto = &(node->rb_left);
175 node = node->rb_left;
176 } else if (rc < 0) {
177 linkto = &(node->rb_right);
178 node = node->rb_right;
179 }
180 }
181
182 memcpy(&(*psidid)->sid, sidptr, sizeof(struct cifs_sid));
183 (*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
184 (*psidid)->refcount = 0;
185
186 sprintf((*psidid)->sidstr, "%s", typestr);
187 strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr);
188 sid_to_str(&(*psidid)->sid, strptr);
189
190 clear_bit(SID_ID_PENDING, &(*psidid)->state);
191 clear_bit(SID_ID_MAPPED, &(*psidid)->state);
192
193 rb_link_node(&(*psidid)->rbnode, parent, linkto);
194 rb_insert_color(&(*psidid)->rbnode, root);
195}
196
197static struct cifs_sid_id *
198id_rb_search(struct rb_root *root, struct cifs_sid *sidptr)
199{
200 int rc;
201 struct rb_node *node = root->rb_node;
202 struct cifs_sid_id *lsidid;
203
204 while (node) {
205 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
206 rc = compare_sids(sidptr, &((lsidid)->sid));
207 if (rc > 0) {
208 node = node->rb_left;
209 } else if (rc < 0) {
210 node = node->rb_right;
211 } else /* node found */
212 return lsidid;
213 }
214
215 return NULL;
216}
217
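
Note that id_rb_insert() and id_rb_search() both descend rb_left when compare_sids() returns a positive value, the mirror image of the usual rbtree idiom. That is harmless as long as the two walkers agree; a quick statement of the shared invariant (illustrative comment only):

    /*
     * Both tree walkers must map comparator results identically:
     *   compare_sids(key, node) > 0  -> follow rb_left
     *   compare_sids(key, node) < 0  -> follow rb_right
     *   == 0                         -> match
     * If insert and search ever disagreed on this mapping, lookups
     * would miss nodes that are present in the tree.
     */
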
218static int
219sidid_pending_wait(void *unused)
220{
221 schedule();
222 return signal_pending(current) ? -ERESTARTSYS : 0;
223}
224
225static int
226sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
227 struct cifs_fattr *fattr, uint sidtype)
228{
229 int rc;
230 unsigned long cid;
231 struct key *idkey;
232 const struct cred *saved_cred;
233 struct cifs_sid_id *psidid, *npsidid;
234 struct rb_root *cidtree;
235 spinlock_t *cidlock;
236
237 if (sidtype == SIDOWNER) {
238 cid = cifs_sb->mnt_uid; /* default uid, in case upcall fails */
239 cidlock = &siduidlock;
240 cidtree = &uidtree;
241 } else if (sidtype == SIDGROUP) {
242 cid = cifs_sb->mnt_gid; /* default gid, in case upcall fails */
243 cidlock = &sidgidlock;
244 cidtree = &gidtree;
245 } else
246 return -ENOENT;
247
248 spin_lock(cidlock);
249 psidid = id_rb_search(cidtree, psid);
250
251 if (!psidid) { /* node does not exist, allocate one & attempt adding */
252 spin_unlock(cidlock);
253 npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL);
254 if (!npsidid)
255 return -ENOMEM;
256
257 npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL);
258 if (!npsidid->sidstr) {
259 kfree(npsidid);
260 return -ENOMEM;
261 }
262
263 spin_lock(cidlock);
264 psidid = id_rb_search(cidtree, psid);
265 if (psidid) { /* node happened to get inserted meanwhile */
266 ++psidid->refcount;
267 spin_unlock(cidlock);
268 kfree(npsidid->sidstr);
269 kfree(npsidid);
270 } else {
271 psidid = npsidid;
272 id_rb_insert(cidtree, psid, &psidid,
273 sidtype == SIDOWNER ? "os:" : "gs:");
274 ++psidid->refcount;
275 spin_unlock(cidlock);
89 } 276 }
277 } else {
278 ++psidid->refcount;
279 spin_unlock(cidlock);
280 }
281
282 /*
283 * If we are here, it is safe to access psidid and its fields
284 * since a reference was taken earlier while holding the spinlock.
285 * A reference on the node is put without holding the spinlock
 286 * and it is OK to do so in this case: the shrinker will not erase
 287 * this node until all references are put, and we do not access
 288 * any fields of the node after a reference is put.
289 */
290 if (test_bit(SID_ID_MAPPED, &psidid->state)) {
291 cid = psidid->id;
292 psidid->time = jiffies; /* update ts for accessing */
293 goto sid_to_id_out;
294 }
90 295
91 cFYI(1, "matching sid: %s\n", wksidarr[i].sidname); 296 if (time_after(psidid->time + SID_MAP_RETRY, jiffies))
92 return 0; /* sids compare/match */ 297 goto sid_to_id_out;
298
299 if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) {
300 saved_cred = override_creds(root_cred);
301 idkey = request_key(&cifs_idmap_key_type, psidid->sidstr, "");
302 if (IS_ERR(idkey))
303 cFYI(1, "%s: Can't map SID to an id", __func__);
304 else {
305 cid = *(unsigned long *)idkey->payload.value;
306 psidid->id = cid;
307 set_bit(SID_ID_MAPPED, &psidid->state);
308 key_put(idkey);
309 kfree(psidid->sidstr);
310 }
311 revert_creds(saved_cred);
312 psidid->time = jiffies; /* update ts for accessing */
313 clear_bit(SID_ID_PENDING, &psidid->state);
314 wake_up_bit(&psidid->state, SID_ID_PENDING);
315 } else {
316 rc = wait_on_bit(&psidid->state, SID_ID_PENDING,
317 sidid_pending_wait, TASK_INTERRUPTIBLE);
318 if (rc) {
319 cFYI(1, "%s: sidid_pending_wait interrupted %d",
320 __func__, rc);
321 --psidid->refcount; /* decremented without spinlock */
322 return rc;
323 }
324 if (test_bit(SID_ID_MAPPED, &psidid->state))
325 cid = psidid->id;
93 } 326 }
94 327
95 cFYI(1, "No matching sid"); 328sid_to_id_out:
96 return -1; 329 --psidid->refcount; /* decremented without spinlock */
330 if (sidtype == SIDOWNER)
331 fattr->cf_uid = cid;
332 else
333 fattr->cf_gid = cid;
334
335 return 0;
336}
337
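
The mapped id comes back through the key payload. In this era's struct key the payload is a union, so the void *data pointer stored by cifs_idmap_key_instantiate() and the unsigned long value read here alias the same storage; *(unsigned long *)idkey->payload.value therefore dereferences the kmalloc'ed upcall blob, whose leading word is expected to hold the uid or gid. A sketch of the aliasing (field list abbreviated; see the struct key definition of this kernel for the full union):

    union {                        /* inside struct key */
            unsigned long value;   /* read by sid_to_id() */
            void *data;            /* set by cifs_idmap_key_instantiate() */
            /* ... */
    } payload;
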
338int
339init_cifs_idmap(void)
340{
341 struct cred *cred;
342 struct key *keyring;
343 int ret;
344
345 cFYI(1, "Registering the %s key type\n", cifs_idmap_key_type.name);
346
347 /* create an override credential set with a special thread keyring in
348 * which requests are cached
349 *
350 * this is used to prevent malicious redirections from being installed
351 * with add_key().
352 */
353 cred = prepare_kernel_cred(NULL);
354 if (!cred)
355 return -ENOMEM;
356
357 keyring = key_alloc(&key_type_keyring, ".cifs_idmap", 0, 0, cred,
358 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
359 KEY_USR_VIEW | KEY_USR_READ,
360 KEY_ALLOC_NOT_IN_QUOTA);
361 if (IS_ERR(keyring)) {
362 ret = PTR_ERR(keyring);
363 goto failed_put_cred;
364 }
365
366 ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
367 if (ret < 0)
368 goto failed_put_key;
369
370 ret = register_key_type(&cifs_idmap_key_type);
371 if (ret < 0)
372 goto failed_put_key;
373
374 /* instruct request_key() to use this special keyring as a cache for
375 * the results it looks up */
376 cred->thread_keyring = keyring;
377 cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
378 root_cred = cred;
379
380 spin_lock_init(&siduidlock);
381 uidtree = RB_ROOT;
382 spin_lock_init(&sidgidlock);
383 gidtree = RB_ROOT;
384
385 register_shrinker(&cifs_shrinker);
386
387 cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring));
388 return 0;
389
390failed_put_key:
391 key_put(keyring);
392failed_put_cred:
393 put_cred(cred);
394 return ret;
395}
396
397void
398exit_cifs_idmap(void)
399{
400 key_revoke(root_cred->thread_keyring);
401 unregister_key_type(&cifs_idmap_key_type);
402 put_cred(root_cred);
403 unregister_shrinker(&cifs_shrinker);
404 cFYI(1, "Unregistered %s key type\n", cifs_idmap_key_type.name);
405}
406
407void
408cifs_destroy_idmaptrees(void)
409{
410 struct rb_root *root;
411 struct rb_node *node;
412
413 root = &uidtree;
414 spin_lock(&siduidlock);
415 while ((node = rb_first(root)))
416 rb_erase(node, root);
417 spin_unlock(&siduidlock);
418
419 root = &gidtree;
420 spin_lock(&sidgidlock);
421 while ((node = rb_first(root)))
422 rb_erase(node, root);
423 spin_unlock(&sidgidlock);
97} 424}
98 425
99/* if the two SIDs (roughly equivalent to a UUID for a user or group) are 426/* if the two SIDs (roughly equivalent to a UUID for a user or group) are
@@ -104,16 +431,24 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
104 int num_subauth, num_sat, num_saw; 431 int num_subauth, num_sat, num_saw;
105 432
106 if ((!ctsid) || (!cwsid)) 433 if ((!ctsid) || (!cwsid))
107 return 0; 434 return 1;
108 435
109 /* compare the revision */ 436 /* compare the revision */
110 if (ctsid->revision != cwsid->revision) 437 if (ctsid->revision != cwsid->revision) {
111 return 0; 438 if (ctsid->revision > cwsid->revision)
439 return 1;
440 else
441 return -1;
442 }
112 443
113 /* compare all of the six auth values */ 444 /* compare all of the six auth values */
114 for (i = 0; i < 6; ++i) { 445 for (i = 0; i < 6; ++i) {
115 if (ctsid->authority[i] != cwsid->authority[i]) 446 if (ctsid->authority[i] != cwsid->authority[i]) {
116 return 0; 447 if (ctsid->authority[i] > cwsid->authority[i])
448 return 1;
449 else
450 return -1;
451 }
117 } 452 }
118 453
119 /* compare all of the subauth values if any */ 454 /* compare all of the subauth values if any */
@@ -122,12 +457,17 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
122 num_subauth = num_sat < num_saw ? num_sat : num_saw; 457 num_subauth = num_sat < num_saw ? num_sat : num_saw;
123 if (num_subauth) { 458 if (num_subauth) {
124 for (i = 0; i < num_subauth; ++i) { 459 for (i = 0; i < num_subauth; ++i) {
125 if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) 460 if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
126 return 0; 461 if (le32_to_cpu(ctsid->sub_auth[i]) >
462 le32_to_cpu(cwsid->sub_auth[i]))
463 return 1;
464 else
465 return -1;
466 }
127 } 467 }
128 } 468 }
129 469
130 return 1; /* sids compare/match */ 470 return 0; /* sids compare/match */
131} 471}
132 472
133 473
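
This hunk changes the contract of compare_sids() from a boolean match (1 = equal, 0 = different or NULL input) to a memcmp()-style three-way comparison (0 = equal, 1/-1 for ordering, 1 for NULL inputs), which is what the rb-tree walkers above require. Every equality test consequently flips polarity:

    /* old contract */                 /* new contract */
    if (compare_sids(a, b))            if (compare_sids(a, b) == 0)
            /* SIDs matched */                 /* SIDs matched */
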
@@ -382,22 +722,22 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
382#ifdef CONFIG_CIFS_DEBUG2 722#ifdef CONFIG_CIFS_DEBUG2
383 dump_ace(ppace[i], end_of_acl); 723 dump_ace(ppace[i], end_of_acl);
384#endif 724#endif
385 if (compare_sids(&(ppace[i]->sid), pownersid)) 725 if (compare_sids(&(ppace[i]->sid), pownersid) == 0)
386 access_flags_to_mode(ppace[i]->access_req, 726 access_flags_to_mode(ppace[i]->access_req,
387 ppace[i]->type, 727 ppace[i]->type,
388 &fattr->cf_mode, 728 &fattr->cf_mode,
389 &user_mask); 729 &user_mask);
390 if (compare_sids(&(ppace[i]->sid), pgrpsid)) 730 if (compare_sids(&(ppace[i]->sid), pgrpsid) == 0)
391 access_flags_to_mode(ppace[i]->access_req, 731 access_flags_to_mode(ppace[i]->access_req,
392 ppace[i]->type, 732 ppace[i]->type,
393 &fattr->cf_mode, 733 &fattr->cf_mode,
394 &group_mask); 734 &group_mask);
395 if (compare_sids(&(ppace[i]->sid), &sid_everyone)) 735 if (compare_sids(&(ppace[i]->sid), &sid_everyone) == 0)
396 access_flags_to_mode(ppace[i]->access_req, 736 access_flags_to_mode(ppace[i]->access_req,
397 ppace[i]->type, 737 ppace[i]->type,
398 &fattr->cf_mode, 738 &fattr->cf_mode,
399 &other_mask); 739 &other_mask);
400 if (compare_sids(&(ppace[i]->sid), &sid_authusers)) 740 if (compare_sids(&(ppace[i]->sid), &sid_authusers) == 0)
401 access_flags_to_mode(ppace[i]->access_req, 741 access_flags_to_mode(ppace[i]->access_req,
402 ppace[i]->type, 742 ppace[i]->type,
403 &fattr->cf_mode, 743 &fattr->cf_mode,
@@ -475,10 +815,10 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
475 815
476 816
477/* Convert CIFS ACL to POSIX form */ 817/* Convert CIFS ACL to POSIX form */
478static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len, 818static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
479 struct cifs_fattr *fattr) 819 struct cifs_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr)
480{ 820{
481 int rc; 821 int rc = 0;
482 struct cifs_sid *owner_sid_ptr, *group_sid_ptr; 822 struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
483 struct cifs_acl *dacl_ptr; /* no need for SACL ptr */ 823 struct cifs_acl *dacl_ptr; /* no need for SACL ptr */
484 char *end_of_acl = ((char *)pntsd) + acl_len; 824 char *end_of_acl = ((char *)pntsd) + acl_len;
@@ -500,12 +840,26 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
500 le32_to_cpu(pntsd->sacloffset), dacloffset); 840 le32_to_cpu(pntsd->sacloffset), dacloffset);
501/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */ 841/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
502 rc = parse_sid(owner_sid_ptr, end_of_acl); 842 rc = parse_sid(owner_sid_ptr, end_of_acl);
503 if (rc) 843 if (rc) {
844 cFYI(1, "%s: Error %d parsing Owner SID", __func__, rc);
845 return rc;
846 }
847 rc = sid_to_id(cifs_sb, owner_sid_ptr, fattr, SIDOWNER);
848 if (rc) {
849 cFYI(1, "%s: Error %d mapping Owner SID to uid", __func__, rc);
504 return rc; 850 return rc;
851 }
505 852
506 rc = parse_sid(group_sid_ptr, end_of_acl); 853 rc = parse_sid(group_sid_ptr, end_of_acl);
507 if (rc) 854 if (rc) {
 855 cFYI(1, "%s: Error %d parsing Group SID", __func__, rc);
508 return rc; 856 return rc;
857 }
858 rc = sid_to_id(cifs_sb, group_sid_ptr, fattr, SIDGROUP);
859 if (rc) {
860 cFYI(1, "%s: Error %d mapping Group SID to gid", __func__, rc);
861 return rc;
862 }
509 863
510 if (dacloffset) 864 if (dacloffset)
511 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, 865 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
@@ -520,7 +874,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
520 memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr, 874 memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr,
521 sizeof(struct cifs_sid)); */ 875 sizeof(struct cifs_sid)); */
522 876
523 return 0; 877 return rc;
524} 878}
525 879
526 880
@@ -592,7 +946,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
592 int oplock = 0; 946 int oplock = 0;
593 int xid, rc; 947 int xid, rc;
594 __u16 fid; 948 __u16 fid;
595 struct cifsTconInfo *tcon; 949 struct cifs_tcon *tcon;
596 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); 950 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
597 951
598 if (IS_ERR(tlink)) 952 if (IS_ERR(tlink))
@@ -660,7 +1014,7 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
660 int oplock = 0; 1014 int oplock = 0;
661 int xid, rc; 1015 int xid, rc;
662 __u16 fid; 1016 __u16 fid;
663 struct cifsTconInfo *tcon; 1017 struct cifs_tcon *tcon;
664 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); 1018 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
665 1019
666 if (IS_ERR(tlink)) 1020 if (IS_ERR(tlink))
@@ -688,7 +1042,7 @@ out:
688} 1042}
689 1043
690/* Set an ACL on the server */ 1044/* Set an ACL on the server */
691static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, 1045int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
692 struct inode *inode, const char *path) 1046 struct inode *inode, const char *path)
693{ 1047{
694 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1048 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -727,7 +1081,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
727 rc = PTR_ERR(pntsd); 1081 rc = PTR_ERR(pntsd);
728 cERROR(1, "%s: error %d getting sec desc", __func__, rc); 1082 cERROR(1, "%s: error %d getting sec desc", __func__, rc);
729 } else { 1083 } else {
730 rc = parse_sec_desc(pntsd, acllen, fattr); 1084 rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr);
731 kfree(pntsd); 1085 kfree(pntsd);
732 if (rc) 1086 if (rc)
733 cERROR(1, "parse sec desc failed rc = %d", rc); 1087 cERROR(1, "parse sec desc failed rc = %d", rc);
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index c4ae7d036563..5c902c7ce524 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -39,6 +39,15 @@
39#define ACCESS_ALLOWED 0 39#define ACCESS_ALLOWED 0
40#define ACCESS_DENIED 1 40#define ACCESS_DENIED 1
41 41
42#define SIDOWNER 1
43#define SIDGROUP 2
44#define SIDLEN 150 /* "S-" + 1 revision + 6 authorities + max 5 sub-authorities */
45
46#define SID_ID_MAPPED 0
47#define SID_ID_PENDING 1
48#define SID_MAP_EXPIRE (3600 * HZ) /* map entry expires after one hour */
49#define SID_MAP_RETRY (300 * HZ) /* wait 5 minutes for next attempt to map */
50
42struct cifs_ntsd { 51struct cifs_ntsd {
43 __le16 revision; /* revision level */ 52 __le16 revision; /* revision level */
44 __le16 type; 53 __le16 type;
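
The SIDLEN bound added above holds up to quick worst-case arithmetic: sid_to_str() emits "S", "-<revision>", up to six "-<authority>" bytes and up to five "-<subauth>" 32-bit values, and sid_to_id() prefixes the three-character "os:"/"gs:" tag:

    /*
     *   3  "os:" or "gs:" prefix
     * + 1  "S"
     * + 4  "-255"            (8-bit revision)
     * + 24 6 x "-255"        (8-bit authority values)
     * + 55 5 x "-4294967295" (32-bit sub-authorities)
     * + 1  NUL
     * = 88, comfortably within SIDLEN (150)
     */
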
@@ -74,7 +83,21 @@ struct cifs_wksid {
74 char sidname[SIDNAMELENGTH]; 83 char sidname[SIDNAMELENGTH];
75} __attribute__((packed)); 84} __attribute__((packed));
76 85
77extern int match_sid(struct cifs_sid *); 86struct cifs_sid_id {
87 unsigned int refcount; /* increment with spinlock, decrement without */
88 unsigned long id;
89 unsigned long time;
90 unsigned long state;
91 char *sidstr;
92 struct rb_node rbnode;
93 struct cifs_sid sid;
94};
95
96#ifdef __KERNEL__
97extern struct key_type cifs_idmap_key_type;
98extern const struct cred *root_cred;
99#endif /* KERNEL */
100
78extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *); 101extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
79 102
80#endif /* _CIFSACL_H */ 103#endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index d1a016be73ba..dfbd9f1f373d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -60,7 +60,7 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
60 server->session_key.response, server->session_key.len); 60 server->session_key.response, server->session_key.len);
61 61
62 crypto_shash_update(&server->secmech.sdescmd5->shash, 62 crypto_shash_update(&server->secmech.sdescmd5->shash,
63 cifs_pdu->Protocol, cifs_pdu->smb_buf_length); 63 cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length));
64 64
65 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); 65 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
66 66
@@ -229,7 +229,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
229} 229}
230 230
231/* first calculate 24 bytes ntlm response and then 16 byte session key */ 231/* first calculate 24 bytes ntlm response and then 16 byte session key */
232int setup_ntlm_response(struct cifsSesInfo *ses) 232int setup_ntlm_response(struct cifs_ses *ses)
233{ 233{
234 int rc = 0; 234 int rc = 0;
235 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; 235 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
@@ -268,10 +268,11 @@ int setup_ntlm_response(struct cifsSesInfo *ses)
268} 268}
269 269
270#ifdef CONFIG_CIFS_WEAK_PW_HASH 270#ifdef CONFIG_CIFS_WEAK_PW_HASH
271void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, 271int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
272 char *lnm_session_key) 272 char *lnm_session_key)
273{ 273{
274 int i; 274 int i;
275 int rc;
275 char password_with_pad[CIFS_ENCPWD_SIZE]; 276 char password_with_pad[CIFS_ENCPWD_SIZE];
276 277
277 memset(password_with_pad, 0, CIFS_ENCPWD_SIZE); 278 memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
@@ -282,7 +283,7 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
282 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); 283 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
283 memcpy(lnm_session_key, password_with_pad, 284 memcpy(lnm_session_key, password_with_pad,
284 CIFS_ENCPWD_SIZE); 285 CIFS_ENCPWD_SIZE);
285 return; 286 return 0;
286 } 287 }
287 288
288 /* calculate old style session key */ 289 /* calculate old style session key */
@@ -299,10 +300,9 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
299 for (i = 0; i < CIFS_ENCPWD_SIZE; i++) 300 for (i = 0; i < CIFS_ENCPWD_SIZE; i++)
300 password_with_pad[i] = toupper(password_with_pad[i]); 301 password_with_pad[i] = toupper(password_with_pad[i]);
301 302
302 SMBencrypt(password_with_pad, cryptkey, lnm_session_key); 303 rc = SMBencrypt(password_with_pad, cryptkey, lnm_session_key);
303 304
304 /* clear password before we return/free memory */ 305 return rc;
305 memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
306} 306}
307#endif /* CIFS_WEAK_PW_HASH */ 307#endif /* CIFS_WEAK_PW_HASH */
308 308
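
Two things change in calc_lanman_hash(): it now propagates the return code of SMBencrypt() instead of returning void, and the memset() that scrubbed the on-stack password copy before returning is dropped. If that scrubbing is still wanted, it would slot back in just before the return (a suggestion, not part of this patch):

    rc = SMBencrypt(password_with_pad, cryptkey, lnm_session_key);

    /* clear the stack copy of the password before returning */
    memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
    return rc;
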
@@ -312,7 +312,7 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
312 * Allocate domain name which gets freed when session struct is deallocated. 312 * Allocate domain name which gets freed when session struct is deallocated.
313 */ 313 */
314static int 314static int
315build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp) 315build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
316{ 316{
317 unsigned int dlen; 317 unsigned int dlen;
318 unsigned int wlen; 318 unsigned int wlen;
@@ -400,7 +400,7 @@ build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
400 * about target string i.e. for some, just user name might suffice. 400 * about target string i.e. for some, just user name might suffice.
401 */ 401 */
402static int 402static int
403find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp) 403find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
404{ 404{
405 unsigned int attrsize; 405 unsigned int attrsize;
406 unsigned int type; 406 unsigned int type;
@@ -445,7 +445,7 @@ find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
445 return 0; 445 return 0;
446} 446}
447 447
448static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash, 448static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
449 const struct nls_table *nls_cp) 449 const struct nls_table *nls_cp)
450{ 450{
451 int rc = 0; 451 int rc = 0;
@@ -527,7 +527,7 @@ calc_exit_2:
527} 527}
528 528
529static int 529static int
530CalcNTLMv2_response(const struct cifsSesInfo *ses, char *ntlmv2_hash) 530CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
531{ 531{
532 int rc; 532 int rc;
533 unsigned int offset = CIFS_SESS_KEY_SIZE + 8; 533 unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
@@ -563,7 +563,7 @@ CalcNTLMv2_response(const struct cifsSesInfo *ses, char *ntlmv2_hash)
563 563
564 564
565int 565int
566setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp) 566setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
567{ 567{
568 int rc; 568 int rc;
569 int baselen; 569 int baselen;
@@ -649,7 +649,7 @@ setup_ntlmv2_rsp_ret:
649} 649}
650 650
651int 651int
652calc_seckey(struct cifsSesInfo *ses) 652calc_seckey(struct cifs_ses *ses)
653{ 653{
654 int rc; 654 int rc;
655 struct crypto_blkcipher *tfm_arc4; 655 struct crypto_blkcipher *tfm_arc4;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5c412b33cd7c..989442dcfb45 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -104,53 +104,25 @@ cifs_sb_deactive(struct super_block *sb)
104} 104}
105 105
106static int 106static int
107cifs_read_super(struct super_block *sb, void *data, 107cifs_read_super(struct super_block *sb, struct smb_vol *volume_info,
108 const char *devname, int silent) 108 const char *devname, int silent)
109{ 109{
110 struct inode *inode; 110 struct inode *inode;
111 struct cifs_sb_info *cifs_sb; 111 struct cifs_sb_info *cifs_sb;
112 int rc = 0; 112 int rc = 0;
113 113
114 /* BB should we make this contingent on mount parm? */
115 sb->s_flags |= MS_NODIRATIME | MS_NOATIME;
116 sb->s_fs_info = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL);
117 cifs_sb = CIFS_SB(sb); 114 cifs_sb = CIFS_SB(sb);
118 if (cifs_sb == NULL)
119 return -ENOMEM;
120 115
121 spin_lock_init(&cifs_sb->tlink_tree_lock); 116 spin_lock_init(&cifs_sb->tlink_tree_lock);
122 cifs_sb->tlink_tree = RB_ROOT; 117 cifs_sb->tlink_tree = RB_ROOT;
123 118
124 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); 119 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
125 if (rc) { 120 if (rc)
126 kfree(cifs_sb);
127 return rc; 121 return rc;
128 }
129 cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages;
130 122
131#ifdef CONFIG_CIFS_DFS_UPCALL 123 cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages;
132 /* copy mount params to sb for use in submounts */
133 /* BB: should we move this after the mount so we
134 * do not have to do the copy on failed mounts?
135 * BB: May be it is better to do simple copy before
136 * complex operation (mount), and in case of fail
137 * just exit instead of doing mount and attempting
138 * undo it if this copy fails?*/
139 if (data) {
140 int len = strlen(data);
141 cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
142 if (cifs_sb->mountdata == NULL) {
143 bdi_destroy(&cifs_sb->bdi);
144 kfree(sb->s_fs_info);
145 sb->s_fs_info = NULL;
146 return -ENOMEM;
147 }
148 strncpy(cifs_sb->mountdata, data, len + 1);
149 cifs_sb->mountdata[len] = '\0';
150 }
151#endif
152 124
153 rc = cifs_mount(sb, cifs_sb, data, devname); 125 rc = cifs_mount(sb, cifs_sb, volume_info, devname);
154 126
155 if (rc) { 127 if (rc) {
156 if (!silent) 128 if (!silent)
@@ -163,7 +135,7 @@ cifs_read_super(struct super_block *sb, void *data,
163 sb->s_bdi = &cifs_sb->bdi; 135 sb->s_bdi = &cifs_sb->bdi;
164 sb->s_blocksize = CIFS_MAX_MSGSIZE; 136 sb->s_blocksize = CIFS_MAX_MSGSIZE;
165 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ 137 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
166 inode = cifs_root_iget(sb, ROOT_I); 138 inode = cifs_root_iget(sb);
167 139
168 if (IS_ERR(inode)) { 140 if (IS_ERR(inode)) {
169 rc = PTR_ERR(inode); 141 rc = PTR_ERR(inode);
@@ -184,12 +156,12 @@ cifs_read_super(struct super_block *sb, void *data,
184 else 156 else
185 sb->s_d_op = &cifs_dentry_ops; 157 sb->s_d_op = &cifs_dentry_ops;
186 158
187#ifdef CONFIG_CIFS_EXPERIMENTAL 159#ifdef CIFS_NFSD_EXPORT
188 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 160 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
189 cFYI(1, "export ops supported"); 161 cFYI(1, "export ops supported");
190 sb->s_export_op = &cifs_export_ops; 162 sb->s_export_op = &cifs_export_ops;
191 } 163 }
192#endif /* EXPERIMENTAL */ 164#endif /* CIFS_NFSD_EXPORT */
193 165
194 return 0; 166 return 0;
195 167
@@ -201,17 +173,7 @@ out_no_root:
201 cifs_umount(sb, cifs_sb); 173 cifs_umount(sb, cifs_sb);
202 174
203out_mount_failed: 175out_mount_failed:
204 if (cifs_sb) { 176 bdi_destroy(&cifs_sb->bdi);
205#ifdef CONFIG_CIFS_DFS_UPCALL
206 if (cifs_sb->mountdata) {
207 kfree(cifs_sb->mountdata);
208 cifs_sb->mountdata = NULL;
209 }
210#endif
211 unload_nls(cifs_sb->local_nls);
212 bdi_destroy(&cifs_sb->bdi);
213 kfree(cifs_sb);
214 }
215 return rc; 177 return rc;
216} 178}
217 179
@@ -231,12 +193,10 @@ cifs_put_super(struct super_block *sb)
231 rc = cifs_umount(sb, cifs_sb); 193 rc = cifs_umount(sb, cifs_sb);
232 if (rc) 194 if (rc)
233 cERROR(1, "cifs_umount failed with return code %d", rc); 195 cERROR(1, "cifs_umount failed with return code %d", rc);
234#ifdef CONFIG_CIFS_DFS_UPCALL
235 if (cifs_sb->mountdata) { 196 if (cifs_sb->mountdata) {
236 kfree(cifs_sb->mountdata); 197 kfree(cifs_sb->mountdata);
237 cifs_sb->mountdata = NULL; 198 cifs_sb->mountdata = NULL;
238 } 199 }
239#endif
240 200
241 unload_nls(cifs_sb->local_nls); 201 unload_nls(cifs_sb->local_nls);
242 bdi_destroy(&cifs_sb->bdi); 202 bdi_destroy(&cifs_sb->bdi);
@@ -248,7 +208,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
248{ 208{
249 struct super_block *sb = dentry->d_sb; 209 struct super_block *sb = dentry->d_sb;
250 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 210 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
251 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb); 211 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
252 int rc = -EOPNOTSUPP; 212 int rc = -EOPNOTSUPP;
253 int xid; 213 int xid;
254 214
@@ -401,7 +361,7 @@ static int
401cifs_show_options(struct seq_file *s, struct vfsmount *m) 361cifs_show_options(struct seq_file *s, struct vfsmount *m)
402{ 362{
403 struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb); 363 struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
404 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb); 364 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
405 struct sockaddr *srcaddr; 365 struct sockaddr *srcaddr;
406 srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; 366 srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
407 367
@@ -455,14 +415,20 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
455 seq_printf(s, ",nocase"); 415 seq_printf(s, ",nocase");
456 if (tcon->retry) 416 if (tcon->retry)
457 seq_printf(s, ",hard"); 417 seq_printf(s, ",hard");
458 if (cifs_sb->prepath) 418 if (tcon->unix_ext)
459 seq_printf(s, ",prepath=%s", cifs_sb->prepath); 419 seq_printf(s, ",unix");
420 else
421 seq_printf(s, ",nounix");
460 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) 422 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
461 seq_printf(s, ",posixpaths"); 423 seq_printf(s, ",posixpaths");
462 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) 424 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
463 seq_printf(s, ",setuids"); 425 seq_printf(s, ",setuids");
464 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) 426 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
465 seq_printf(s, ",serverino"); 427 seq_printf(s, ",serverino");
428 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
429 seq_printf(s, ",rwpidforward");
430 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL)
431 seq_printf(s, ",forcemand");
466 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) 432 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
467 seq_printf(s, ",directio"); 433 seq_printf(s, ",directio");
468 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 434 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
@@ -495,7 +461,7 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
495static void cifs_umount_begin(struct super_block *sb) 461static void cifs_umount_begin(struct super_block *sb)
496{ 462{
497 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 463 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
498 struct cifsTconInfo *tcon; 464 struct cifs_tcon *tcon;
499 465
500 if (cifs_sb == NULL) 466 if (cifs_sb == NULL)
501 return; 467 return;
@@ -570,29 +536,189 @@ static const struct super_operations cifs_super_ops = {
570#endif 536#endif
571}; 537};
572 538
539/*
540 * Get root dentry from superblock according to prefix path mount option.
541 * Return dentry with refcount + 1 on success and NULL otherwise.
542 */
543static struct dentry *
544cifs_get_root(struct smb_vol *vol, struct super_block *sb)
545{
546 int xid, rc;
547 struct inode *inode;
548 struct qstr name;
549 struct dentry *dparent = NULL, *dchild = NULL, *alias;
550 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
551 unsigned int i, full_len, len;
552 char *full_path = NULL, *pstart;
553 char sep;
554
555 full_path = cifs_build_path_to_root(vol, cifs_sb,
556 cifs_sb_master_tcon(cifs_sb));
557 if (full_path == NULL)
558 return NULL;
559
560 cFYI(1, "Get root dentry for %s", full_path);
561
562 xid = GetXid();
563 sep = CIFS_DIR_SEP(cifs_sb);
564 dparent = dget(sb->s_root);
565 full_len = strlen(full_path);
566 full_path[full_len] = sep;
567 pstart = full_path + 1;
568
569 for (i = 1, len = 0; i <= full_len; i++) {
570 if (full_path[i] != sep || !len) {
571 len++;
572 continue;
573 }
574
575 full_path[i] = 0;
576 cFYI(1, "get dentry for %s", pstart);
577
578 name.name = pstart;
579 name.len = len;
580 name.hash = full_name_hash(pstart, len);
581 dchild = d_lookup(dparent, &name);
582 if (dchild == NULL) {
583 cFYI(1, "not exists");
584 dchild = d_alloc(dparent, &name);
585 if (dchild == NULL) {
586 dput(dparent);
587 dparent = NULL;
588 goto out;
589 }
590 }
591
592 cFYI(1, "get inode");
593 if (dchild->d_inode == NULL) {
594 cFYI(1, "not exists");
595 inode = NULL;
596 if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
597 rc = cifs_get_inode_info_unix(&inode, full_path,
598 sb, xid);
599 else
600 rc = cifs_get_inode_info(&inode, full_path,
601 NULL, sb, xid, NULL);
602 if (rc) {
603 dput(dchild);
604 dput(dparent);
605 dparent = NULL;
606 goto out;
607 }
608 alias = d_materialise_unique(dchild, inode);
609 if (alias != NULL) {
610 dput(dchild);
611 if (IS_ERR(alias)) {
612 dput(dparent);
613 dparent = NULL;
614 goto out;
615 }
616 dchild = alias;
617 }
618 }
619 cFYI(1, "parent %p, child %p", dparent, dchild);
620
621 dput(dparent);
622 dparent = dchild;
623 len = 0;
624 pstart = full_path + i + 1;
625 full_path[i] = sep;
626 }
627out:
628 _FreeXid(xid);
629 kfree(full_path);
630 return dparent;
631}
632
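
cifs_get_root() walks the prefix path one component at a time. The trick at the top is that full_path[full_len] = sep overwrites the terminating NUL with a separator, so the i <= full_len loop sees a separator after the last component as well and needs no end-of-string special case; cifs_build_path_to_root() is assumed to leave room for that extra byte. A userspace analogue of the tokenization (illustrative only, not kernel code):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char full_path[] = "\\a\\b\\c\0"; /* spare byte for the sentinel */
            const char sep = '\\';
            size_t full_len = strlen(full_path);
            size_t i, len;
            char *pstart = full_path + 1;

            full_path[full_len] = sep;        /* sentinel replaces the NUL */
            for (i = 1, len = 0; i <= full_len; i++) {
                    if (full_path[i] != sep || !len) {
                            len++;
                            continue;
                    }
                    full_path[i] = '\0';
                    printf("component: %s\n", pstart); /* a, then b, then c */
                    pstart = full_path + i + 1;
                    full_path[i] = sep;
                    len = 0;
            }
            return 0;
    }
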
573static struct dentry * 633static struct dentry *
574cifs_do_mount(struct file_system_type *fs_type, 634cifs_do_mount(struct file_system_type *fs_type,
575 int flags, const char *dev_name, void *data) 635 int flags, const char *dev_name, void *data)
576{ 636{
577 int rc; 637 int rc;
578 struct super_block *sb; 638 struct super_block *sb;
579 639 struct cifs_sb_info *cifs_sb;
580 sb = sget(fs_type, NULL, set_anon_super, NULL); 640 struct smb_vol *volume_info;
641 struct cifs_mnt_data mnt_data;
642 struct dentry *root;
581 643
582 cFYI(1, "Devname: %s flags: %d ", dev_name, flags); 644 cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
583 645
584 if (IS_ERR(sb)) 646 rc = cifs_setup_volume_info(&volume_info, (char *)data, dev_name);
585 return ERR_CAST(sb); 647 if (rc)
648 return ERR_PTR(rc);
649
650 cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL);
651 if (cifs_sb == NULL) {
652 root = ERR_PTR(-ENOMEM);
653 goto out;
654 }
655
656 cifs_setup_cifs_sb(volume_info, cifs_sb);
657
658 mnt_data.vol = volume_info;
659 mnt_data.cifs_sb = cifs_sb;
660 mnt_data.flags = flags;
661
662 sb = sget(fs_type, cifs_match_super, set_anon_super, &mnt_data);
663 if (IS_ERR(sb)) {
664 root = ERR_CAST(sb);
665 goto out_cifs_sb;
666 }
667
668 if (sb->s_fs_info) {
669 cFYI(1, "Use existing superblock");
670 goto out_shared;
671 }
672
673 /*
674 * Copy mount params for use in submounts. Better to do
675 * the copy here and deal with the error before cleanup gets
676 * complicated post-mount.
677 */
678 cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL);
679 if (cifs_sb->mountdata == NULL) {
680 root = ERR_PTR(-ENOMEM);
681 goto out_super;
682 }
586 683
587 sb->s_flags = flags; 684 sb->s_flags = flags;
685 /* BB should we make this contingent on mount parm? */
686 sb->s_flags |= MS_NODIRATIME | MS_NOATIME;
687 sb->s_fs_info = cifs_sb;
588 688
589 rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0); 689 rc = cifs_read_super(sb, volume_info, dev_name,
690 flags & MS_SILENT ? 1 : 0);
590 if (rc) { 691 if (rc) {
591 deactivate_locked_super(sb); 692 root = ERR_PTR(rc);
592 return ERR_PTR(rc); 693 goto out_super;
593 } 694 }
695
594 sb->s_flags |= MS_ACTIVE; 696 sb->s_flags |= MS_ACTIVE;
595 return dget(sb->s_root); 697
698 root = cifs_get_root(volume_info, sb);
699 if (root == NULL)
700 goto out_super;
701
702 cFYI(1, "dentry root is: %p", root);
703 goto out;
704
705out_shared:
706 root = cifs_get_root(volume_info, sb);
707 if (root)
708 cFYI(1, "dentry root is: %p", root);
709 goto out;
710
711out_super:
712 kfree(cifs_sb->mountdata);
713 deactivate_locked_super(sb);
714
715out_cifs_sb:
716 unload_nls(cifs_sb->local_nls);
717 kfree(cifs_sb);
718
719out:
720 cifs_cleanup_volume_info(&volume_info);
721 return root;
596} 722}
597 723
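
The mount path now lets sget() look for a shareable superblock before creating one. cifs_match_super() (added in connect.c, not shown in this hunk) receives the cifs_mnt_data and judges whether an existing sb points at the same server, session and tree connect with compatible options; only a brand-new sb goes through set_anon_super, comes back with a NULL s_fs_info, and takes the full cifs_read_super() path. The sget() callback contract, for orientation (a paraphrase of the VFS convention, not the cifs_match_super body):

    /*
     * sget(fs_type, test, set, data) invokes test(sb, data) on each
     * existing superblock of this fs_type:
     *   nonzero -> reuse that sb ("Use existing superblock" above);
     *   zero    -> keep scanning; if nothing matches, set() initializes
     *              a fresh anonymous sb whose s_fs_info is still NULL,
     *              which is how cifs_do_mount() tells the cases apart.
     */
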
598static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 724static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
@@ -618,16 +744,31 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
618{ 744{
619 /* origin == SEEK_END => we must revalidate the cached file length */ 745 /* origin == SEEK_END => we must revalidate the cached file length */
620 if (origin == SEEK_END) { 746 if (origin == SEEK_END) {
621 int retval; 747 int rc;
622 748 struct inode *inode = file->f_path.dentry->d_inode;
623 /* some applications poll for the file length in this strange 749
624 way so we must seek to end on non-oplocked files by 750 /*
625 setting the revalidate time to zero */ 751 * We need to be sure that all dirty pages are written and the
626 CIFS_I(file->f_path.dentry->d_inode)->time = 0; 752 * server has the newest file length.
627 753 */
628 retval = cifs_revalidate_file(file); 754 if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping &&
629 if (retval < 0) 755 inode->i_mapping->nrpages != 0) {
630 return (loff_t)retval; 756 rc = filemap_fdatawait(inode->i_mapping);
757 if (rc) {
758 mapping_set_error(inode->i_mapping, rc);
759 return rc;
760 }
761 }
762 /*
763 * Some applications poll for the file length in this strange
764 * way so we must seek to end on non-oplocked files by
765 * setting the revalidate time to zero.
766 */
767 CIFS_I(inode)->time = 0;
768
769 rc = cifs_revalidate_file_attr(file);
770 if (rc < 0)
771 return (loff_t)rc;
631 } 772 }
632 return generic_file_llseek_unlocked(file, offset, origin); 773 return generic_file_llseek_unlocked(file, offset, origin);
633} 774}
@@ -760,10 +901,11 @@ const struct file_operations cifs_file_strict_ops = {
760}; 901};
761 902
762const struct file_operations cifs_file_direct_ops = { 903const struct file_operations cifs_file_direct_ops = {
763 /* no aio, no readv - 904 /* BB reevaluate whether they can be done with directio, no cache */
764 BB reevaluate whether they can be done with directio, no cache */ 905 .read = do_sync_read,
765 .read = cifs_user_read, 906 .write = do_sync_write,
766 .write = cifs_user_write, 907 .aio_read = cifs_user_readv,
908 .aio_write = cifs_user_writev,
767 .open = cifs_open, 909 .open = cifs_open,
768 .release = cifs_close, 910 .release = cifs_close,
769 .lock = cifs_lock, 911 .lock = cifs_lock,
@@ -815,10 +957,11 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
815}; 957};
816 958
817const struct file_operations cifs_file_direct_nobrl_ops = { 959const struct file_operations cifs_file_direct_nobrl_ops = {
818 /* no mmap, no aio, no readv - 960 /* BB reevaluate whether they can be done with directio, no cache */
819 BB reevaluate whether they can be done with directio, no cache */ 961 .read = do_sync_read,
820 .read = cifs_user_read, 962 .write = do_sync_write,
821 .write = cifs_user_write, 963 .aio_read = cifs_user_readv,
964 .aio_write = cifs_user_writev,
822 .open = cifs_open, 965 .open = cifs_open,
823 .release = cifs_close, 966 .release = cifs_close,
824 .fsync = cifs_fsync, 967 .fsync = cifs_fsync,
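
Replacing .read/.write with do_sync_read/do_sync_write while supplying .aio_read/.aio_write routes both the synchronous and AIO entry points through one vectored implementation (cifs_user_readv/cifs_user_writev). The do_sync_* helpers in fs/read_write.c wrap the user buffer in a one-entry iovec plus a synchronous kiocb and call the aio method, roughly as follows (paraphrased, details trimmed):

    ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len,
                         loff_t *ppos)
    {
            struct iovec iov = { .iov_base = buf, .iov_len = len };
            struct kiocb kiocb;
            ssize_t ret;

            init_sync_kiocb(&kiocb, filp);
            kiocb.ki_pos = *ppos;
            ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
            if (ret == -EIOCBQUEUED)
                    ret = wait_on_sync_kiocb(&kiocb);
            *ppos = kiocb.ki_pos;
            return ret;
    }
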
@@ -981,10 +1124,10 @@ init_cifs(void)
981 int rc = 0; 1124 int rc = 0;
982 cifs_proc_init(); 1125 cifs_proc_init();
983 INIT_LIST_HEAD(&cifs_tcp_ses_list); 1126 INIT_LIST_HEAD(&cifs_tcp_ses_list);
984#ifdef CONFIG_CIFS_EXPERIMENTAL 1127#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
985 INIT_LIST_HEAD(&GlobalDnotifyReqList); 1128 INIT_LIST_HEAD(&GlobalDnotifyReqList);
986 INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); 1129 INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
 987#endif 1130#endif /* was needed for dnotify, and will be needed for inotify when the VFS fix lands */
988/* 1131/*
989 * Initialize Global counters 1132 * Initialize Global counters
990 */ 1133 */
@@ -1033,22 +1176,33 @@ init_cifs(void)
1033 if (rc) 1176 if (rc)
1034 goto out_destroy_mids; 1177 goto out_destroy_mids;
1035 1178
1036 rc = register_filesystem(&cifs_fs_type);
1037 if (rc)
1038 goto out_destroy_request_bufs;
1039#ifdef CONFIG_CIFS_UPCALL 1179#ifdef CONFIG_CIFS_UPCALL
1040 rc = register_key_type(&cifs_spnego_key_type); 1180 rc = register_key_type(&cifs_spnego_key_type);
1041 if (rc) 1181 if (rc)
1042 goto out_unregister_filesystem; 1182 goto out_destroy_request_bufs;
1043#endif 1183#endif /* CONFIG_CIFS_UPCALL */
1184
1185#ifdef CONFIG_CIFS_ACL
1186 rc = init_cifs_idmap();
1187 if (rc)
1188 goto out_register_key_type;
1189#endif /* CONFIG_CIFS_ACL */
1190
1191 rc = register_filesystem(&cifs_fs_type);
1192 if (rc)
1193 goto out_init_cifs_idmap;
1044 1194
1045 return 0; 1195 return 0;
1046 1196
1047#ifdef CONFIG_CIFS_UPCALL 1197out_init_cifs_idmap:
1048out_unregister_filesystem: 1198#ifdef CONFIG_CIFS_ACL
1049 unregister_filesystem(&cifs_fs_type); 1199 exit_cifs_idmap();
1200out_register_key_type:
1050#endif 1201#endif
1202#ifdef CONFIG_CIFS_UPCALL
1203 unregister_key_type(&cifs_spnego_key_type);
1051out_destroy_request_bufs: 1204out_destroy_request_bufs:
1205#endif
1052 cifs_destroy_request_bufs(); 1206 cifs_destroy_request_bufs();
1053out_destroy_mids: 1207out_destroy_mids:
1054 cifs_destroy_mids(); 1208 cifs_destroy_mids();
@@ -1070,6 +1224,10 @@ exit_cifs(void)
1070#ifdef CONFIG_CIFS_DFS_UPCALL 1224#ifdef CONFIG_CIFS_DFS_UPCALL
1071 cifs_dfs_release_automount_timer(); 1225 cifs_dfs_release_automount_timer();
1072#endif 1226#endif
1227#ifdef CONFIG_CIFS_ACL
1228 cifs_destroy_idmaptrees();
1229 exit_cifs_idmap();
1230#endif
1073#ifdef CONFIG_CIFS_UPCALL 1231#ifdef CONFIG_CIFS_UPCALL
1074 unregister_key_type(&cifs_spnego_key_type); 1232 unregister_key_type(&cifs_spnego_key_type);
1075#endif 1233#endif
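
The init_cifs() reshuffle defers register_filesystem() until everything it depends on is ready: request buffers, then the spnego key type (CONFIG_CIFS_UPCALL), then the idmap keyring and key type (CONFIG_CIFS_ACL), and only then the filesystem itself, with the error labels unwinding in exactly the reverse order. exit_cifs() above mirrors the same sequence, tearing down the idmap trees and key type before the spnego key type.
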
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index a9371b6578c0..64313f778ebf 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -47,7 +47,7 @@ extern void cifs_sb_deactive(struct super_block *sb);
47 47
48/* Functions related to inodes */ 48/* Functions related to inodes */
49extern const struct inode_operations cifs_dir_inode_ops; 49extern const struct inode_operations cifs_dir_inode_ops;
50extern struct inode *cifs_root_iget(struct super_block *, unsigned long); 50extern struct inode *cifs_root_iget(struct super_block *);
51extern int cifs_create(struct inode *, struct dentry *, int, 51extern int cifs_create(struct inode *, struct dentry *, int,
52 struct nameidata *); 52 struct nameidata *);
53extern struct dentry *cifs_lookup(struct inode *, struct dentry *, 53extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
@@ -59,9 +59,11 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int);
59extern int cifs_rmdir(struct inode *, struct dentry *); 59extern int cifs_rmdir(struct inode *, struct dentry *);
60extern int cifs_rename(struct inode *, struct dentry *, struct inode *, 60extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
61 struct dentry *); 61 struct dentry *);
62extern int cifs_revalidate_file_attr(struct file *filp);
63extern int cifs_revalidate_dentry_attr(struct dentry *);
62extern int cifs_revalidate_file(struct file *filp); 64extern int cifs_revalidate_file(struct file *filp);
63extern int cifs_revalidate_dentry(struct dentry *); 65extern int cifs_revalidate_dentry(struct dentry *);
64extern void cifs_invalidate_mapping(struct inode *inode); 66extern int cifs_invalidate_mapping(struct inode *inode);
65extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 67extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
66extern int cifs_setattr(struct dentry *, struct iattr *); 68extern int cifs_setattr(struct dentry *, struct iattr *);
67 69
@@ -80,12 +82,12 @@ extern const struct file_operations cifs_file_strict_nobrl_ops;
80extern int cifs_open(struct inode *inode, struct file *file); 82extern int cifs_open(struct inode *inode, struct file *file);
81extern int cifs_close(struct inode *inode, struct file *file); 83extern int cifs_close(struct inode *inode, struct file *file);
82extern int cifs_closedir(struct inode *inode, struct file *file); 84extern int cifs_closedir(struct inode *inode, struct file *file);
83extern ssize_t cifs_user_read(struct file *file, char __user *read_data, 85extern ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
84 size_t read_size, loff_t *poffset); 86 unsigned long nr_segs, loff_t pos);
85extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, 87extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
86 unsigned long nr_segs, loff_t pos); 88 unsigned long nr_segs, loff_t pos);
87extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, 89extern ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
88 size_t write_size, loff_t *poffset); 90 unsigned long nr_segs, loff_t pos);
89extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, 91extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
90 unsigned long nr_segs, loff_t pos); 92 unsigned long nr_segs, loff_t pos);
91extern int cifs_lock(struct file *, int, struct file_lock *); 93extern int cifs_lock(struct file *, int, struct file_lock *);
@@ -123,9 +125,9 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
123extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); 125extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
124extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); 126extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
125 127
126#ifdef CONFIG_CIFS_EXPERIMENTAL 128#ifdef CIFS_NFSD_EXPORT
127extern const struct export_operations cifs_export_ops; 129extern const struct export_operations cifs_export_ops;
128#endif /* EXPERIMENTAL */ 130#endif /* CIFS_NFSD_EXPORT */
129 131
130#define CIFS_VERSION "1.71" 132#define CIFS_VERSION "1.72"
131#endif /* _CIFSFS_H */ 133#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a5d1106fcbde..6255fa812c7a 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -155,6 +155,81 @@ struct cifs_cred {
155 ***************************************************************** 155 *****************************************************************
156 */ 156 */
157 157
158struct smb_vol {
159 char *username;
160 char *password;
161 char *domainname;
162 char *UNC;
163 char *UNCip;
164 char *iocharset; /* local code page for mapping to and from Unicode */
165 char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
166 char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
167 uid_t cred_uid;
168 uid_t linux_uid;
169 gid_t linux_gid;
170 mode_t file_mode;
171 mode_t dir_mode;
172 unsigned secFlg;
173 bool retry:1;
174 bool intr:1;
175 bool setuids:1;
176 bool override_uid:1;
177 bool override_gid:1;
178 bool dynperm:1;
179 bool noperm:1;
180 bool no_psx_acl:1; /* set if posix acl support should be disabled */
181 bool cifs_acl:1;
182 bool no_xattr:1; /* set if xattr (EA) support should be disabled*/
183 bool server_ino:1; /* use inode numbers from server ie UniqueId */
184 bool direct_io:1;
185 bool strict_io:1; /* strict cache behavior */
186 bool remap:1; /* set to remap seven reserved chars in filenames */
187 bool posix_paths:1; /* unset to not ask for posix pathnames. */
188 bool no_linux_ext:1;
189 bool sfu_emul:1;
190 bool nullauth:1; /* attempt to authenticate with null user */
191 bool nocase:1; /* request case insensitive filenames */
192 bool nobrl:1; /* disable sending byte range locks to srv */
193 bool mand_lock:1; /* send mandatory not posix byte range lock reqs */
194 bool seal:1; /* request transport encryption on share */
195 bool nodfs:1; /* Do not request DFS, even if available */
196 bool local_lease:1; /* check leases only on local system, not remote */
197 bool noblocksnd:1;
198 bool noautotune:1;
199 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
200 bool fsc:1; /* enable fscache */
201 bool mfsymlinks:1; /* use Minshall+French Symlinks */
202 bool multiuser:1;
203 bool rwpidforward:1; /* pid forward for read/write operations */
204 unsigned int rsize;
205 unsigned int wsize;
206 bool sockopt_tcp_nodelay:1;
207 unsigned short int port;
208 unsigned long actimeo; /* attribute cache timeout (jiffies) */
209 char *prepath;
210 struct sockaddr_storage srcaddr; /* allow binding to a local IP */
211 struct nls_table *local_nls;
212};
213
214#define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \
215 CIFS_MOUNT_SERVER_INUM | CIFS_MOUNT_DIRECT_IO | \
216 CIFS_MOUNT_NO_XATTR | CIFS_MOUNT_MAP_SPECIAL_CHR | \
217 CIFS_MOUNT_UNX_EMUL | CIFS_MOUNT_NO_BRL | \
218 CIFS_MOUNT_CIFS_ACL | CIFS_MOUNT_OVERR_UID | \
219 CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \
220 CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \
221 CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \
222 CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO)
223
224#define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \
225 MS_NODEV | MS_SYNCHRONOUS)
226
227struct cifs_mnt_data {
228 struct cifs_sb_info *cifs_sb;
229 struct smb_vol *vol;
230 int flags;
231};
232
158struct TCP_Server_Info { 233struct TCP_Server_Info {
159 struct list_head tcp_ses_list; 234 struct list_head tcp_ses_list;
160 struct list_head smb_ses_list; 235 struct list_head smb_ses_list;
@@ -179,7 +254,7 @@ struct TCP_Server_Info {
179 struct mutex srv_mutex; 254 struct mutex srv_mutex;
180 struct task_struct *tsk; 255 struct task_struct *tsk;
181 char server_GUID[16]; 256 char server_GUID[16];
182 char secMode; 257 char sec_mode;
183 bool session_estab; /* mark when very first sess is established */ 258 bool session_estab; /* mark when very first sess is established */
184 u16 dialect; /* dialect index that server chose */ 259 u16 dialect; /* dialect index that server chose */
185 enum securityEnum secType; 260 enum securityEnum secType;
@@ -254,7 +329,7 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
254/* 329/*
255 * Session structure. One of these for each uid session with a particular host 330 * Session structure. One of these for each uid session with a particular host
256 */ 331 */
257struct cifsSesInfo { 332struct cifs_ses {
258 struct list_head smb_ses_list; 333 struct list_head smb_ses_list;
259 struct list_head tcon_list; 334 struct list_head tcon_list;
260 struct mutex session_mutex; 335 struct mutex session_mutex;
@@ -274,7 +349,8 @@ struct cifsSesInfo {
274 int capabilities; 349 int capabilities;
275 char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for 350 char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for
276 TCP names - will ipv6 and sctp addresses fit? */ 351 TCP names - will ipv6 and sctp addresses fit? */
 277 char *user_name; 352 char *user_name; /* must not be null except during
 353 sess init; filled in after mount option parsing */
278 char *domainName; 354 char *domainName;
279 char *password; 355 char *password;
280 struct session_key auth_key; 356 struct session_key auth_key;
@@ -293,11 +369,11 @@ struct cifsSesInfo {
293 * there is one of these for each connection to a resource on a particular 369 * there is one of these for each connection to a resource on a particular
294 * session 370 * session
295 */ 371 */
296struct cifsTconInfo { 372struct cifs_tcon {
297 struct list_head tcon_list; 373 struct list_head tcon_list;
298 int tc_count; 374 int tc_count;
299 struct list_head openFileList; 375 struct list_head openFileList;
300 struct cifsSesInfo *ses; /* pointer to session associated with */ 376 struct cifs_ses *ses; /* pointer to session associated with */
301 char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ 377 char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */
302 char *nativeFileSystem; 378 char *nativeFileSystem;
303 char *password; /* for share-level security */ 379 char *password; /* for share-level security */
@@ -379,12 +455,12 @@ struct tcon_link {
379#define TCON_LINK_IN_TREE 2 455#define TCON_LINK_IN_TREE 2
380 unsigned long tl_time; 456 unsigned long tl_time;
381 atomic_t tl_count; 457 atomic_t tl_count;
382 struct cifsTconInfo *tl_tcon; 458 struct cifs_tcon *tl_tcon;
383}; 459};
384 460
385extern struct tcon_link *cifs_sb_tlink(struct cifs_sb_info *cifs_sb); 461extern struct tcon_link *cifs_sb_tlink(struct cifs_sb_info *cifs_sb);
386 462
387static inline struct cifsTconInfo * 463static inline struct cifs_tcon *
388tlink_tcon(struct tcon_link *tlink) 464tlink_tcon(struct tcon_link *tlink)
389{ 465{
390 return tlink->tl_tcon; 466 return tlink->tl_tcon;
@@ -401,7 +477,7 @@ cifs_get_tlink(struct tcon_link *tlink)
401} 477}
402 478
403/* This function is always expected to succeed */ 479/* This function is always expected to succeed */
404extern struct cifsTconInfo *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb); 480extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
405 481
406/* 482/*
407 * This info hangs off the cifsFileInfo structure, pointed to by llist. 483 * This info hangs off the cifsFileInfo structure, pointed to by llist.
@@ -454,6 +530,14 @@ struct cifsFileInfo {
454 struct work_struct oplock_break; /* work for oplock breaks */ 530 struct work_struct oplock_break; /* work for oplock breaks */
455}; 531};
456 532
533struct cifs_io_parms {
534 __u16 netfid;
535 __u32 pid;
536 __u64 offset;
537 unsigned int length;
538 struct cifs_tcon *tcon;
539};
540
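This new struct bundles the per-call I/O arguments that CIFSSMBRead/CIFSSMBWrite previously took individually (see the updated prototypes in cifsproto.h below). A minimal caller sketch, assuming an open cifsFileInfo exposing a netfid:

	struct cifs_io_parms io_parms;
	unsigned int nbytes;
	int buf_type = CIFS_NO_BUFFER;
	char *buf = NULL;

	io_parms.netfid = open_file->netfid;	/* assumed open-file handle field */
	io_parms.pid = pid;			/* pid of the opener */
	io_parms.tcon = tcon;
	io_parms.offset = offset;
	io_parms.length = len;
	rc = CIFSSMBRead(xid, &io_parms, &nbytes, &buf, &buf_type);
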
457/* 541/*
458 * Take a reference on the file private data. Must be called with 542 * Take a reference on the file private data. Must be called with
459 * cifs_file_list_lock held. 543 * cifs_file_list_lock held.
@@ -508,10 +592,30 @@ static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb)
508 return '\\'; 592 return '\\';
509} 593}
510 594
595static inline void
596convert_delimiter(char *path, char delim)
597{
598 int i;
599 char old_delim;
600
601 if (path == NULL)
602 return;
603
604 if (delim == '/')
605 old_delim = '\\';
606 else
607 old_delim = '/';
608
609 for (i = 0; path[i] != '\0'; i++) {
610 if (path[i] == old_delim)
611 path[i] = delim;
612 }
613}
614
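Usage of the new helper is symmetric in either direction; a quick sketch:

	char path[] = "\\server\\share\\dir\\file";

	convert_delimiter(path, '/');	/* now "/server/share/dir/file" */
	convert_delimiter(path, '\\');	/* back to the original form */
	convert_delimiter(NULL, '/');	/* safe no-op */
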
511#ifdef CONFIG_CIFS_STATS 615#ifdef CONFIG_CIFS_STATS
512#define cifs_stats_inc atomic_inc 616#define cifs_stats_inc atomic_inc
513 617
514static inline void cifs_stats_bytes_written(struct cifsTconInfo *tcon, 618static inline void cifs_stats_bytes_written(struct cifs_tcon *tcon,
515 unsigned int bytes) 619 unsigned int bytes)
516{ 620{
517 if (bytes) { 621 if (bytes) {
@@ -521,7 +625,7 @@ static inline void cifs_stats_bytes_written(struct cifsTconInfo *tcon,
521 } 625 }
522} 626}
523 627
524static inline void cifs_stats_bytes_read(struct cifsTconInfo *tcon, 628static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon,
525 unsigned int bytes) 629 unsigned int bytes)
526{ 630{
527 spin_lock(&tcon->stat_lock); 631 spin_lock(&tcon->stat_lock);
@@ -542,9 +646,8 @@ struct mid_q_entry;
542 * This is the prototype for the mid callback function. When creating one, 646 * This is the prototype for the mid callback function. When creating one,
543 * take special care to avoid deadlocks. Things to bear in mind: 647 * take special care to avoid deadlocks. Things to bear in mind:
544 * 648 *
545 * - it will be called by cifsd 649 * - it will be called by cifsd, with no locks held
546 * - the GlobalMid_Lock will be held 650 * - the mid will be removed from any lists
547 * - the mid will be removed from the pending_mid_q list
548 */ 651 */
549typedef void (mid_callback_t)(struct mid_q_entry *mid); 652typedef void (mid_callback_t)(struct mid_q_entry *mid);
550 653
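A minimal callback sketch consistent with the relaxed rules above: it runs in cifsd context with no locks held and the mid already unlinked, so the callback owns the mid's lifetime and must arrange for it to be freed. DeleteMidQEntry() is declared in cifsproto.h; the callback_data field is assumed here.

	static void example_callback(struct mid_q_entry *mid)
	{
		struct TCP_Server_Info *server = mid->callback_data;	/* assumed */

		/* inspect the mid's state/response here, then release it */
		DeleteMidQEntry(mid);
	}
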
@@ -572,7 +675,7 @@ struct mid_q_entry {
572struct oplock_q_entry { 675struct oplock_q_entry {
573 struct list_head qhead; 676 struct list_head qhead;
574 struct inode *pinode; 677 struct inode *pinode;
575 struct cifsTconInfo *tcon; 678 struct cifs_tcon *tcon;
576 __u16 netfid; 679 __u16 netfid;
577}; 680};
578 681
@@ -655,6 +758,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
655#define MID_RESPONSE_RECEIVED 4 758#define MID_RESPONSE_RECEIVED 4
656#define MID_RETRY_NEEDED 8 /* session closed while this request out */ 759#define MID_RETRY_NEEDED 8 /* session closed while this request out */
657#define MID_RESPONSE_MALFORMED 0x10 760#define MID_RESPONSE_MALFORMED 0x10
761#define MID_SHUTDOWN 0x20
658 762
659/* Types of response buffer returned from SendReceive2 */ 763/* Types of response buffer returned from SendReceive2 */
660#define CIFS_NO_BUFFER 0 /* Response buffer not returned */ 764#define CIFS_NO_BUFFER 0 /* Response buffer not returned */
@@ -780,10 +884,12 @@ GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock;
780 */ 884 */
781GLOBAL_EXTERN spinlock_t cifs_file_list_lock; 885GLOBAL_EXTERN spinlock_t cifs_file_list_lock;
782 886
887#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
783/* Outstanding dir notify requests */ 888/* Outstanding dir notify requests */
784GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; 889GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
785/* DirNotify response queue */ 890/* DirNotify response queue */
786GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q; 891GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q;
 892#endif /* was needed for dnotify; will be needed for inotify once the VFS is fixed */
787 893
788/* 894/*
789 * Global transaction id (XID) information 895 * Global transaction id (XID) information
@@ -830,6 +936,11 @@ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
830/* reconnect after this many failed echo attempts */ 936/* reconnect after this many failed echo attempts */
831GLOBAL_EXTERN unsigned short echo_retries; 937GLOBAL_EXTERN unsigned short echo_retries;
832 938
939GLOBAL_EXTERN struct rb_root uidtree;
940GLOBAL_EXTERN struct rb_root gidtree;
941GLOBAL_EXTERN spinlock_t siduidlock;
942GLOBAL_EXTERN spinlock_t sidgidlock;
943
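These trees back the new SID-to-uid/gid mapping, guarded by the two spinlocks; the lifecycle hooks are declared in cifsproto.h in this same series. A sketch of the expected module init/exit ordering (error path elided):

	/* in cifs module init, before the first mount */
	rc = init_cifs_idmap();

	/* in module exit */
	exit_cifs_idmap();
	cifs_destroy_idmaptrees();	/* empties uidtree and gidtree */
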
833void cifs_oplock_break(struct work_struct *work); 944void cifs_oplock_break(struct work_struct *work);
834void cifs_oplock_break_get(struct cifsFileInfo *cfile); 945void cifs_oplock_break_get(struct cifsFileInfo *cfile);
835void cifs_oplock_break_put(struct cifsFileInfo *cfile); 946void cifs_oplock_break_put(struct cifsFileInfo *cfile);
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b5c8cc5d7a7f..de3aa285de03 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -397,9 +397,9 @@
397#define GETU32(var) (*((__u32 *)var)) /* BB check for endian issues */ 397#define GETU32(var) (*((__u32 *)var)) /* BB check for endian issues */
398 398
399struct smb_hdr { 399struct smb_hdr {
400 __u32 smb_buf_length; /* big endian on wire *//* BB length is only two 400 __be32 smb_buf_length; /* BB length is only two (rarely three) bytes,
401 or three bytes - with one or two byte type preceding it that are 401 with one or two byte "type" preceding it that will be
402 zero - we could mask the type byte off just in case BB */ 402 zero - we could mask the type byte off */
403 __u8 Protocol[4]; 403 __u8 Protocol[4];
404 __u8 Command; 404 __u8 Command;
405 union { 405 union {
@@ -428,43 +428,28 @@ struct smb_hdr {
428 __u8 WordCount; 428 __u8 WordCount;
429} __attribute__((packed)); 429} __attribute__((packed));
430 430
431/* given a pointer to an smb_hdr retrieve a char pointer to the byte count */ 431/* given a pointer to an smb_hdr, retrieve a void pointer to the ByteCount */
432#define BCC(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + \ 432static inline void *
433 (2 * (smb_var)->WordCount)) 433BCC(struct smb_hdr *smb)
434{
435 return (void *)smb + sizeof(*smb) + 2 * smb->WordCount;
436}
434 437
435/* given a pointer to an smb_hdr retrieve the pointer to the byte area */ 438/* given a pointer to an smb_hdr retrieve the pointer to the byte area */
436#define pByteArea(smb_var) (BCC(smb_var) + 2) 439#define pByteArea(smb_var) (BCC(smb_var) + 2)
437 440
438/* get the converted ByteCount for a SMB packet and return it */
439static inline __u16
440get_bcc(struct smb_hdr *hdr)
441{
442 __u16 *bc_ptr = (__u16 *)BCC(hdr);
443
444 return get_unaligned(bc_ptr);
445}
446
447/* get the unconverted ByteCount for a SMB packet and return it */ 441/* get the unconverted ByteCount for a SMB packet and return it */
448static inline __u16 442static inline __u16
449get_bcc_le(struct smb_hdr *hdr) 443get_bcc(struct smb_hdr *hdr)
450{ 444{
451 __le16 *bc_ptr = (__le16 *)BCC(hdr); 445 __le16 *bc_ptr = (__le16 *)BCC(hdr);
452 446
453 return get_unaligned_le16(bc_ptr); 447 return get_unaligned_le16(bc_ptr);
454} 448}
455 449
456/* set the ByteCount for a SMB packet in host-byte order */
457static inline void
458put_bcc(__u16 count, struct smb_hdr *hdr)
459{
460 __u16 *bc_ptr = (__u16 *)BCC(hdr);
461
462 put_unaligned(count, bc_ptr);
463}
464
465/* set the ByteCount for a SMB packet in little-endian */ 450/* set the ByteCount for a SMB packet in little-endian */
466static inline void 451static inline void
467put_bcc_le(__u16 count, struct smb_hdr *hdr) 452put_bcc(__u16 count, struct smb_hdr *hdr)
468{ 453{
469 __le16 *bc_ptr = (__le16 *)BCC(hdr); 454 __le16 *bc_ptr = (__le16 *)BCC(hdr);
470 455
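Net effect of this hunk: BCC() becomes a type-safe inline returning void *, the host-order get_bcc()/put_bcc() pair is dropped, and the little-endian variants are renamed into their place, so every ByteCount access is now explicitly wire-order. A small reader-side sketch, with response_buf standing in for any parsed response:

	struct smb_hdr *hdr = (struct smb_hdr *)response_buf;
	__u16 bcc = get_bcc(hdr);		/* converted from little-endian */
	unsigned char *bytes = pByteArea(hdr);	/* data follows the 2-byte BCC */
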
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 8096f27ad9a8..953f84413c77 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -53,9 +53,13 @@ do { \
53 cFYI(1, "CIFS VFS: leaving %s (xid = %d) rc = %d", \ 53 cFYI(1, "CIFS VFS: leaving %s (xid = %d) rc = %d", \
54 __func__, curr_xid, (int)rc); \ 54 __func__, curr_xid, (int)rc); \
55} while (0) 55} while (0)
56extern int init_cifs_idmap(void);
57extern void exit_cifs_idmap(void);
58extern void cifs_destroy_idmaptrees(void);
56extern char *build_path_from_dentry(struct dentry *); 59extern char *build_path_from_dentry(struct dentry *);
57extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb, 60extern char *cifs_build_path_to_root(struct smb_vol *vol,
58 struct cifsTconInfo *tcon); 61 struct cifs_sb_info *cifs_sb,
62 struct cifs_tcon *tcon);
59extern char *build_wildcard_path_from_dentry(struct dentry *direntry); 63extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
60extern char *cifs_compose_mount_options(const char *sb_mountdata, 64extern char *cifs_compose_mount_options(const char *sb_mountdata,
61 const char *fullpath, const struct dfs_info3_param *ref, 65 const char *fullpath, const struct dfs_info3_param *ref,
@@ -64,20 +68,22 @@ extern char *cifs_compose_mount_options(const char *sb_mountdata,
64extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, 68extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
65 struct TCP_Server_Info *server); 69 struct TCP_Server_Info *server);
66extern void DeleteMidQEntry(struct mid_q_entry *midEntry); 70extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
67extern int cifs_call_async(struct TCP_Server_Info *server, 71extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
68 struct smb_hdr *in_buf, mid_callback_t *callback, 72 unsigned int nvec, mid_callback_t *callback,
69 void *cbdata); 73 void *cbdata, bool ignore_pend);
70extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *, 74extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *,
71 struct smb_hdr * /* input */ , 75 struct smb_hdr * /* input */ ,
72 struct smb_hdr * /* out */ , 76 struct smb_hdr * /* out */ ,
73 int * /* bytes returned */ , const int long_op); 77 int * /* bytes returned */ , const int long_op);
74extern int SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses, 78extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
75 struct smb_hdr *in_buf, int flags); 79 struct smb_hdr *in_buf, int flags);
76extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *, 80extern int cifs_check_receive(struct mid_q_entry *mid,
81 struct TCP_Server_Info *server, bool log_error);
82extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
77 struct kvec *, int /* nvec to send */, 83 struct kvec *, int /* nvec to send */,
78 int * /* type of buf returned */ , const int flags); 84 int * /* type of buf returned */ , const int flags);
79extern int SendReceiveBlockingLock(const unsigned int xid, 85extern int SendReceiveBlockingLock(const unsigned int xid,
80 struct cifsTconInfo *ptcon, 86 struct cifs_tcon *ptcon,
81 struct smb_hdr *in_buf , 87 struct smb_hdr *in_buf ,
82 struct smb_hdr *out_buf, 88 struct smb_hdr *out_buf,
83 int *bytes_returned); 89 int *bytes_returned);
@@ -90,21 +96,20 @@ extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
90extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); 96extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
91extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); 97extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
92extern unsigned int smbCalcSize(struct smb_hdr *ptr); 98extern unsigned int smbCalcSize(struct smb_hdr *ptr);
93extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
94extern int decode_negTokenInit(unsigned char *security_blob, int length, 99extern int decode_negTokenInit(unsigned char *security_blob, int length,
95 struct TCP_Server_Info *server); 100 struct TCP_Server_Info *server);
96extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); 101extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
97extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port); 102extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port);
98extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, 103extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
99 const unsigned short int port); 104 const unsigned short int port);
100extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr); 105extern int map_smb_to_linux_error(struct smb_hdr *smb, bool logErr);
101extern void header_assemble(struct smb_hdr *, char /* command */ , 106extern void header_assemble(struct smb_hdr *, char /* command */ ,
102 const struct cifsTconInfo *, int /* length of 107 const struct cifs_tcon *, int /* length of
103 fixed section (word count) in two byte units */); 108 fixed section (word count) in two byte units */);
104extern int small_smb_init_no_tc(const int smb_cmd, const int wct, 109extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
105 struct cifsSesInfo *ses, 110 struct cifs_ses *ses,
106 void **request_buf); 111 void **request_buf);
107extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, 112extern int CIFS_SessSetup(unsigned int xid, struct cifs_ses *ses,
108 const struct nls_table *nls_cp); 113 const struct nls_table *nls_cp);
109extern __u16 GetNextMid(struct TCP_Server_Info *server); 114extern __u16 GetNextMid(struct TCP_Server_Info *server);
110extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); 115extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
@@ -143,103 +148,111 @@ extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
143extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64); 148extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64);
144extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, 149extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
145 const char *, u32 *); 150 const char *, u32 *);
151extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
152 const char *);
146 153
147extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, 154extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
148 const char *); 155 struct cifs_sb_info *cifs_sb);
156extern int cifs_match_super(struct super_block *, void *);
157extern void cifs_cleanup_volume_info(struct smb_vol **pvolume_info);
158extern int cifs_setup_volume_info(struct smb_vol **pvolume_info,
159 char *mount_data, const char *devname);
160extern int cifs_mount(struct super_block *, struct cifs_sb_info *,
161 struct smb_vol *, const char *);
149extern int cifs_umount(struct super_block *, struct cifs_sb_info *); 162extern int cifs_umount(struct super_block *, struct cifs_sb_info *);
150extern void cifs_dfs_release_automount_timer(void); 163extern void cifs_dfs_release_automount_timer(void);
151void cifs_proc_init(void); 164void cifs_proc_init(void);
152void cifs_proc_clean(void); 165void cifs_proc_clean(void);
153 166
154extern int cifs_negotiate_protocol(unsigned int xid, 167extern int cifs_negotiate_protocol(unsigned int xid,
155 struct cifsSesInfo *ses); 168 struct cifs_ses *ses);
156extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses, 169extern int cifs_setup_session(unsigned int xid, struct cifs_ses *ses,
157 struct nls_table *nls_info); 170 struct nls_table *nls_info);
158extern int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses); 171extern int CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses);
159 172
160extern int CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, 173extern int CIFSTCon(unsigned int xid, struct cifs_ses *ses,
161 const char *tree, struct cifsTconInfo *tcon, 174 const char *tree, struct cifs_tcon *tcon,
162 const struct nls_table *); 175 const struct nls_table *);
163 176
164extern int CIFSFindFirst(const int xid, struct cifsTconInfo *tcon, 177extern int CIFSFindFirst(const int xid, struct cifs_tcon *tcon,
165 const char *searchName, const struct nls_table *nls_codepage, 178 const char *searchName, const struct nls_table *nls_codepage,
166 __u16 *searchHandle, struct cifs_search_info *psrch_inf, 179 __u16 *searchHandle, struct cifs_search_info *psrch_inf,
167 int map, const char dirsep); 180 int map, const char dirsep);
168 181
169extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon, 182extern int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
170 __u16 searchHandle, struct cifs_search_info *psrch_inf); 183 __u16 searchHandle, struct cifs_search_info *psrch_inf);
171 184
172extern int CIFSFindClose(const int, struct cifsTconInfo *tcon, 185extern int CIFSFindClose(const int, struct cifs_tcon *tcon,
173 const __u16 search_handle); 186 const __u16 search_handle);
174 187
175extern int CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon, 188extern int CIFSSMBQFileInfo(const int xid, struct cifs_tcon *tcon,
176 u16 netfid, FILE_ALL_INFO *pFindData); 189 u16 netfid, FILE_ALL_INFO *pFindData);
177extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, 190extern int CIFSSMBQPathInfo(const int xid, struct cifs_tcon *tcon,
178 const unsigned char *searchName, 191 const unsigned char *searchName,
179 FILE_ALL_INFO *findData, 192 FILE_ALL_INFO *findData,
180 int legacy /* whether to use old info level */, 193 int legacy /* whether to use old info level */,
181 const struct nls_table *nls_codepage, int remap); 194 const struct nls_table *nls_codepage, int remap);
182extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon, 195extern int SMBQueryInformation(const int xid, struct cifs_tcon *tcon,
183 const unsigned char *searchName, 196 const unsigned char *searchName,
184 FILE_ALL_INFO *findData, 197 FILE_ALL_INFO *findData,
185 const struct nls_table *nls_codepage, int remap); 198 const struct nls_table *nls_codepage, int remap);
186 199
187extern int CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon, 200extern int CIFSSMBUnixQFileInfo(const int xid, struct cifs_tcon *tcon,
188 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData); 201 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData);
189extern int CIFSSMBUnixQPathInfo(const int xid, 202extern int CIFSSMBUnixQPathInfo(const int xid,
190 struct cifsTconInfo *tcon, 203 struct cifs_tcon *tcon,
191 const unsigned char *searchName, 204 const unsigned char *searchName,
192 FILE_UNIX_BASIC_INFO *pFindData, 205 FILE_UNIX_BASIC_INFO *pFindData,
193 const struct nls_table *nls_codepage, int remap); 206 const struct nls_table *nls_codepage, int remap);
194 207
195extern int CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses, 208extern int CIFSGetDFSRefer(const int xid, struct cifs_ses *ses,
196 const unsigned char *searchName, 209 const unsigned char *searchName,
197 struct dfs_info3_param **target_nodes, 210 struct dfs_info3_param **target_nodes,
198 unsigned int *number_of_nodes_in_array, 211 unsigned int *number_of_nodes_in_array,
199 const struct nls_table *nls_codepage, int remap); 212 const struct nls_table *nls_codepage, int remap);
200 213
201extern int get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, 214extern int get_dfs_path(int xid, struct cifs_ses *pSesInfo,
202 const char *old_path, 215 const char *old_path,
203 const struct nls_table *nls_codepage, 216 const struct nls_table *nls_codepage,
204 unsigned int *pnum_referrals, 217 unsigned int *pnum_referrals,
205 struct dfs_info3_param **preferrals, 218 struct dfs_info3_param **preferrals,
206 int remap); 219 int remap);
207extern void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon, 220extern void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon,
208 struct super_block *sb, struct smb_vol *vol); 221 struct super_block *sb, struct smb_vol *vol);
209extern int CIFSSMBQFSInfo(const int xid, struct cifsTconInfo *tcon, 222extern int CIFSSMBQFSInfo(const int xid, struct cifs_tcon *tcon,
210 struct kstatfs *FSData); 223 struct kstatfs *FSData);
211extern int SMBOldQFSInfo(const int xid, struct cifsTconInfo *tcon, 224extern int SMBOldQFSInfo(const int xid, struct cifs_tcon *tcon,
212 struct kstatfs *FSData); 225 struct kstatfs *FSData);
213extern int CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, 226extern int CIFSSMBSetFSUnixInfo(const int xid, struct cifs_tcon *tcon,
214 __u64 cap); 227 __u64 cap);
215 228
216extern int CIFSSMBQFSAttributeInfo(const int xid, 229extern int CIFSSMBQFSAttributeInfo(const int xid,
217 struct cifsTconInfo *tcon); 230 struct cifs_tcon *tcon);
218extern int CIFSSMBQFSDeviceInfo(const int xid, struct cifsTconInfo *tcon); 231extern int CIFSSMBQFSDeviceInfo(const int xid, struct cifs_tcon *tcon);
219extern int CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon); 232extern int CIFSSMBQFSUnixInfo(const int xid, struct cifs_tcon *tcon);
220extern int CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon, 233extern int CIFSSMBQFSPosixInfo(const int xid, struct cifs_tcon *tcon,
221 struct kstatfs *FSData); 234 struct kstatfs *FSData);
222 235
223extern int CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon, 236extern int CIFSSMBSetPathInfo(const int xid, struct cifs_tcon *tcon,
224 const char *fileName, const FILE_BASIC_INFO *data, 237 const char *fileName, const FILE_BASIC_INFO *data,
225 const struct nls_table *nls_codepage, 238 const struct nls_table *nls_codepage,
226 int remap_special_chars); 239 int remap_special_chars);
227extern int CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon, 240extern int CIFSSMBSetFileInfo(const int xid, struct cifs_tcon *tcon,
228 const FILE_BASIC_INFO *data, __u16 fid, 241 const FILE_BASIC_INFO *data, __u16 fid,
229 __u32 pid_of_opener); 242 __u32 pid_of_opener);
230extern int CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon, 243extern int CIFSSMBSetFileDisposition(const int xid, struct cifs_tcon *tcon,
231 bool delete_file, __u16 fid, __u32 pid_of_opener); 244 bool delete_file, __u16 fid, __u32 pid_of_opener);
232#if 0 245#if 0
233extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, 246extern int CIFSSMBSetAttrLegacy(int xid, struct cifs_tcon *tcon,
234 char *fileName, __u16 dos_attributes, 247 char *fileName, __u16 dos_attributes,
235 const struct nls_table *nls_codepage); 248 const struct nls_table *nls_codepage);
236#endif /* possibly unneeded function */ 249#endif /* possibly unneeded function */
237extern int CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon, 250extern int CIFSSMBSetEOF(const int xid, struct cifs_tcon *tcon,
238 const char *fileName, __u64 size, 251 const char *fileName, __u64 size,
239 bool setAllocationSizeFlag, 252 bool setAllocationSizeFlag,
240 const struct nls_table *nls_codepage, 253 const struct nls_table *nls_codepage,
241 int remap_special_chars); 254 int remap_special_chars);
242extern int CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, 255extern int CIFSSMBSetFileSize(const int xid, struct cifs_tcon *tcon,
243 __u64 size, __u16 fileHandle, __u32 opener_pid, 256 __u64 size, __u16 fileHandle, __u32 opener_pid,
244 bool AllocSizeFlag); 257 bool AllocSizeFlag);
245 258
@@ -253,121 +266,116 @@ struct cifs_unix_set_info_args {
253 dev_t device; 266 dev_t device;
254}; 267};
255 268
256extern int CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon, 269extern int CIFSSMBUnixSetFileInfo(const int xid, struct cifs_tcon *tcon,
257 const struct cifs_unix_set_info_args *args, 270 const struct cifs_unix_set_info_args *args,
258 u16 fid, u32 pid_of_opener); 271 u16 fid, u32 pid_of_opener);
259 272
260extern int CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *pTcon, 273extern int CIFSSMBUnixSetPathInfo(const int xid, struct cifs_tcon *pTcon,
261 char *fileName, 274 char *fileName,
262 const struct cifs_unix_set_info_args *args, 275 const struct cifs_unix_set_info_args *args,
263 const struct nls_table *nls_codepage, 276 const struct nls_table *nls_codepage,
264 int remap_special_chars); 277 int remap_special_chars);
265 278
266extern int CIFSSMBMkDir(const int xid, struct cifsTconInfo *tcon, 279extern int CIFSSMBMkDir(const int xid, struct cifs_tcon *tcon,
267 const char *newName, 280 const char *newName,
268 const struct nls_table *nls_codepage, 281 const struct nls_table *nls_codepage,
269 int remap_special_chars); 282 int remap_special_chars);
270extern int CIFSSMBRmDir(const int xid, struct cifsTconInfo *tcon, 283extern int CIFSSMBRmDir(const int xid, struct cifs_tcon *tcon,
271 const char *name, const struct nls_table *nls_codepage, 284 const char *name, const struct nls_table *nls_codepage,
272 int remap_special_chars); 285 int remap_special_chars);
273extern int CIFSPOSIXDelFile(const int xid, struct cifsTconInfo *tcon, 286extern int CIFSPOSIXDelFile(const int xid, struct cifs_tcon *tcon,
274 const char *name, __u16 type, 287 const char *name, __u16 type,
275 const struct nls_table *nls_codepage, 288 const struct nls_table *nls_codepage,
276 int remap_special_chars); 289 int remap_special_chars);
277extern int CIFSSMBDelFile(const int xid, struct cifsTconInfo *tcon, 290extern int CIFSSMBDelFile(const int xid, struct cifs_tcon *tcon,
278 const char *name, 291 const char *name,
279 const struct nls_table *nls_codepage, 292 const struct nls_table *nls_codepage,
280 int remap_special_chars); 293 int remap_special_chars);
281extern int CIFSSMBRename(const int xid, struct cifsTconInfo *tcon, 294extern int CIFSSMBRename(const int xid, struct cifs_tcon *tcon,
282 const char *fromName, const char *toName, 295 const char *fromName, const char *toName,
283 const struct nls_table *nls_codepage, 296 const struct nls_table *nls_codepage,
284 int remap_special_chars); 297 int remap_special_chars);
285extern int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon, 298extern int CIFSSMBRenameOpenFile(const int xid, struct cifs_tcon *pTcon,
286 int netfid, const char *target_name, 299 int netfid, const char *target_name,
287 const struct nls_table *nls_codepage, 300 const struct nls_table *nls_codepage,
288 int remap_special_chars); 301 int remap_special_chars);
289extern int CIFSCreateHardLink(const int xid, 302extern int CIFSCreateHardLink(const int xid,
290 struct cifsTconInfo *tcon, 303 struct cifs_tcon *tcon,
291 const char *fromName, const char *toName, 304 const char *fromName, const char *toName,
292 const struct nls_table *nls_codepage, 305 const struct nls_table *nls_codepage,
293 int remap_special_chars); 306 int remap_special_chars);
294extern int CIFSUnixCreateHardLink(const int xid, 307extern int CIFSUnixCreateHardLink(const int xid,
295 struct cifsTconInfo *tcon, 308 struct cifs_tcon *tcon,
296 const char *fromName, const char *toName, 309 const char *fromName, const char *toName,
297 const struct nls_table *nls_codepage, 310 const struct nls_table *nls_codepage,
298 int remap_special_chars); 311 int remap_special_chars);
299extern int CIFSUnixCreateSymLink(const int xid, 312extern int CIFSUnixCreateSymLink(const int xid,
300 struct cifsTconInfo *tcon, 313 struct cifs_tcon *tcon,
301 const char *fromName, const char *toName, 314 const char *fromName, const char *toName,
302 const struct nls_table *nls_codepage); 315 const struct nls_table *nls_codepage);
303extern int CIFSSMBUnixQuerySymLink(const int xid, 316extern int CIFSSMBUnixQuerySymLink(const int xid,
304 struct cifsTconInfo *tcon, 317 struct cifs_tcon *tcon,
305 const unsigned char *searchName, char **syminfo, 318 const unsigned char *searchName, char **syminfo,
306 const struct nls_table *nls_codepage); 319 const struct nls_table *nls_codepage);
320#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL
307extern int CIFSSMBQueryReparseLinkInfo(const int xid, 321extern int CIFSSMBQueryReparseLinkInfo(const int xid,
308 struct cifsTconInfo *tcon, 322 struct cifs_tcon *tcon,
309 const unsigned char *searchName, 323 const unsigned char *searchName,
310 char *symlinkinfo, const int buflen, __u16 fid, 324 char *symlinkinfo, const int buflen, __u16 fid,
311 const struct nls_table *nls_codepage); 325 const struct nls_table *nls_codepage);
 312 326#endif /* temporarily unused until cifs_symlink is fixed */
313extern int CIFSSMBOpen(const int xid, struct cifsTconInfo *tcon, 327extern int CIFSSMBOpen(const int xid, struct cifs_tcon *tcon,
314 const char *fileName, const int disposition, 328 const char *fileName, const int disposition,
315 const int access_flags, const int omode, 329 const int access_flags, const int omode,
316 __u16 *netfid, int *pOplock, FILE_ALL_INFO *, 330 __u16 *netfid, int *pOplock, FILE_ALL_INFO *,
317 const struct nls_table *nls_codepage, int remap); 331 const struct nls_table *nls_codepage, int remap);
318extern int SMBLegacyOpen(const int xid, struct cifsTconInfo *tcon, 332extern int SMBLegacyOpen(const int xid, struct cifs_tcon *tcon,
319 const char *fileName, const int disposition, 333 const char *fileName, const int disposition,
320 const int access_flags, const int omode, 334 const int access_flags, const int omode,
321 __u16 *netfid, int *pOplock, FILE_ALL_INFO *, 335 __u16 *netfid, int *pOplock, FILE_ALL_INFO *,
322 const struct nls_table *nls_codepage, int remap); 336 const struct nls_table *nls_codepage, int remap);
323extern int CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, 337extern int CIFSPOSIXCreate(const int xid, struct cifs_tcon *tcon,
324 u32 posix_flags, __u64 mode, __u16 *netfid, 338 u32 posix_flags, __u64 mode, __u16 *netfid,
325 FILE_UNIX_BASIC_INFO *pRetData, 339 FILE_UNIX_BASIC_INFO *pRetData,
326 __u32 *pOplock, const char *name, 340 __u32 *pOplock, const char *name,
327 const struct nls_table *nls_codepage, int remap); 341 const struct nls_table *nls_codepage, int remap);
328extern int CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, 342extern int CIFSSMBClose(const int xid, struct cifs_tcon *tcon,
329 const int smb_file_id); 343 const int smb_file_id);
330 344
331extern int CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, 345extern int CIFSSMBFlush(const int xid, struct cifs_tcon *tcon,
332 const int smb_file_id); 346 const int smb_file_id);
333 347
334extern int CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, 348extern int CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms,
335 const int netfid, unsigned int count, 349 unsigned int *nbytes, char **buf,
336 const __u64 lseek, unsigned int *nbytes, char **buf,
337 int *return_buf_type); 350 int *return_buf_type);
338extern int CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon, 351extern int CIFSSMBWrite(const int xid, struct cifs_io_parms *io_parms,
339 const int netfid, const unsigned int count, 352 unsigned int *nbytes, const char *buf,
340 const __u64 lseek, unsigned int *nbytes, 353 const char __user *ubuf, const int long_op);
341 const char *buf, const char __user *ubuf, 354extern int CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms,
355 unsigned int *nbytes, struct kvec *iov, const int nvec,
342 const int long_op); 356 const int long_op);
343extern int CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon, 357extern int CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon,
344 const int netfid, const unsigned int count,
345 const __u64 offset, unsigned int *nbytes,
346 struct kvec *iov, const int nvec, const int long_op);
347extern int CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
348 const unsigned char *searchName, __u64 *inode_number, 358 const unsigned char *searchName, __u64 *inode_number,
349 const struct nls_table *nls_codepage, 359 const struct nls_table *nls_codepage,
350 int remap_special_chars); 360 int remap_special_chars);
351extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
352 const struct nls_table *cp, int mapChars);
353 361
354extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, 362extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
355 const __u16 netfid, const __u64 len, 363 const __u16 netfid, const __u64 len,
356 const __u64 offset, const __u32 numUnlock, 364 const __u64 offset, const __u32 numUnlock,
357 const __u32 numLock, const __u8 lockType, 365 const __u32 numLock, const __u8 lockType,
358 const bool waitFlag, const __u8 oplock_level); 366 const bool waitFlag, const __u8 oplock_level);
359extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, 367extern int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon,
360 const __u16 smb_file_id, const int get_flag, 368 const __u16 smb_file_id, const int get_flag,
361 const __u64 len, struct file_lock *, 369 const __u64 len, struct file_lock *,
362 const __u16 lock_type, const bool waitFlag); 370 const __u16 lock_type, const bool waitFlag);
363extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon); 371extern int CIFSSMBTDis(const int xid, struct cifs_tcon *tcon);
364extern int CIFSSMBEcho(struct TCP_Server_Info *server); 372extern int CIFSSMBEcho(struct TCP_Server_Info *server);
365extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses); 373extern int CIFSSMBLogoff(const int xid, struct cifs_ses *ses);
366 374
367extern struct cifsSesInfo *sesInfoAlloc(void); 375extern struct cifs_ses *sesInfoAlloc(void);
368extern void sesInfoFree(struct cifsSesInfo *); 376extern void sesInfoFree(struct cifs_ses *);
369extern struct cifsTconInfo *tconInfoAlloc(void); 377extern struct cifs_tcon *tconInfoAlloc(void);
370extern void tconInfoFree(struct cifsTconInfo *); 378extern void tconInfoFree(struct cifs_tcon *);
371 379
372extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); 380extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
373extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, 381extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
@@ -376,49 +384,51 @@ extern int cifs_verify_signature(struct smb_hdr *,
376 struct TCP_Server_Info *server, 384 struct TCP_Server_Info *server,
377 __u32 expected_sequence_number); 385 __u32 expected_sequence_number);
378extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); 386extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
379extern int setup_ntlm_response(struct cifsSesInfo *); 387extern int setup_ntlm_response(struct cifs_ses *);
380extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *); 388extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
381extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); 389extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
382extern void cifs_crypto_shash_release(struct TCP_Server_Info *); 390extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
383extern int calc_seckey(struct cifsSesInfo *); 391extern int calc_seckey(struct cifs_ses *);
384 392
385#ifdef CONFIG_CIFS_WEAK_PW_HASH 393#ifdef CONFIG_CIFS_WEAK_PW_HASH
386extern void calc_lanman_hash(const char *password, const char *cryptkey, 394extern int calc_lanman_hash(const char *password, const char *cryptkey,
387 bool encrypt, char *lnm_session_key); 395 bool encrypt, char *lnm_session_key);
388#endif /* CIFS_WEAK_PW_HASH */ 396#endif /* CIFS_WEAK_PW_HASH */
397#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
398extern int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon,
399 const int notify_subdirs, const __u16 netfid,
400 __u32 filter, struct file *file, int multishot,
401 const struct nls_table *nls_codepage);
 402#endif /* was needed for dnotify; will be needed for inotify once the VFS is fixed */
389extern int CIFSSMBCopy(int xid, 403extern int CIFSSMBCopy(int xid,
390 struct cifsTconInfo *source_tcon, 404 struct cifs_tcon *source_tcon,
391 const char *fromName, 405 const char *fromName,
392 const __u16 target_tid, 406 const __u16 target_tid,
393 const char *toName, const int flags, 407 const char *toName, const int flags,
394 const struct nls_table *nls_codepage, 408 const struct nls_table *nls_codepage,
395 int remap_special_chars); 409 int remap_special_chars);
396extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon, 410extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifs_tcon *tcon,
397 const int notify_subdirs, const __u16 netfid,
398 __u32 filter, struct file *file, int multishot,
399 const struct nls_table *nls_codepage);
400extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
401 const unsigned char *searchName, 411 const unsigned char *searchName,
402 const unsigned char *ea_name, char *EAData, 412 const unsigned char *ea_name, char *EAData,
403 size_t bufsize, const struct nls_table *nls_codepage, 413 size_t bufsize, const struct nls_table *nls_codepage,
404 int remap_special_chars); 414 int remap_special_chars);
405extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, 415extern int CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon,
406 const char *fileName, const char *ea_name, 416 const char *fileName, const char *ea_name,
407 const void *ea_value, const __u16 ea_value_len, 417 const void *ea_value, const __u16 ea_value_len,
408 const struct nls_table *nls_codepage, int remap_special_chars); 418 const struct nls_table *nls_codepage, int remap_special_chars);
409extern int CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, 419extern int CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon,
410 __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); 420 __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen);
411extern int CIFSSMBSetCIFSACL(const int, struct cifsTconInfo *, __u16, 421extern int CIFSSMBSetCIFSACL(const int, struct cifs_tcon *, __u16,
412 struct cifs_ntsd *, __u32); 422 struct cifs_ntsd *, __u32);
413extern int CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon, 423extern int CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon,
414 const unsigned char *searchName, 424 const unsigned char *searchName,
415 char *acl_inf, const int buflen, const int acl_type, 425 char *acl_inf, const int buflen, const int acl_type,
416 const struct nls_table *nls_codepage, int remap_special_chars); 426 const struct nls_table *nls_codepage, int remap_special_chars);
417extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon, 427extern int CIFSSMBSetPosixACL(const int xid, struct cifs_tcon *tcon,
418 const unsigned char *fileName, 428 const unsigned char *fileName,
419 const char *local_acl, const int buflen, const int acl_type, 429 const char *local_acl, const int buflen, const int acl_type,
420 const struct nls_table *nls_codepage, int remap_special_chars); 430 const struct nls_table *nls_codepage, int remap_special_chars);
421extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon, 431extern int CIFSGetExtAttr(const int xid, struct cifs_tcon *tcon,
422 const int netfid, __u64 *pExtAttrBits, __u64 *pMask); 432 const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
423extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); 433extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
424extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr); 434extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
@@ -427,9 +437,24 @@ extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
427 struct cifs_sb_info *cifs_sb, int xid); 437 struct cifs_sb_info *cifs_sb, int xid);
428extern int mdfour(unsigned char *, unsigned char *, int); 438extern int mdfour(unsigned char *, unsigned char *, int);
429extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); 439extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
430extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8, 440extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
431 unsigned char *p24);
432extern void E_P16(unsigned char *p14, unsigned char *p16);
433extern void E_P24(unsigned char *p21, const unsigned char *c8,
434 unsigned char *p24); 441 unsigned char *p24);
442
443/* asynchronous write support */
444struct cifs_writedata {
445 struct kref refcount;
446 enum writeback_sync_modes sync_mode;
447 struct work_struct work;
448 struct cifsFileInfo *cfile;
449 __u64 offset;
450 unsigned int bytes;
451 int result;
452 unsigned int nr_pages;
453 struct page *pages[1];
454};
455
456int cifs_async_writev(struct cifs_writedata *wdata);
457struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages);
458void cifs_writedata_release(struct kref *refcount);
459
435#endif /* _CIFSPROTO_H */ 460#endif /* _CIFSPROTO_H */
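The async write API added above implies a simple refcounted lifecycle; a hedged caller sketch follows (page attachment and locking elided; wbc is the caller's writeback_control, and dropping the final reference on the error path is assumed to be the caller's job):

	struct cifs_writedata *wdata = cifs_writedata_alloc(nr_pages);

	if (wdata == NULL)
		return -ENOMEM;
	wdata->cfile = cfile;			/* open file the write targets */
	wdata->sync_mode = wbc->sync_mode;
	wdata->offset = offset;
	wdata->bytes = bytes;
	wdata->nr_pages = nr_pages;
	/* ... fill wdata->pages[] with locked, dirty pages ... */
	rc = cifs_async_writev(wdata);
	if (rc)
		kref_put(&wdata->refcount, cifs_writedata_release);
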
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index df959bae6728..1a9fe7f816d1 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -32,6 +32,7 @@
32#include <linux/vfs.h> 32#include <linux/vfs.h>
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include <linux/posix_acl_xattr.h> 34#include <linux/posix_acl_xattr.h>
35#include <linux/pagemap.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include "cifspdu.h" 37#include "cifspdu.h"
37#include "cifsglob.h" 38#include "cifsglob.h"
@@ -84,7 +85,7 @@ static struct {
84 85
85/* Mark as invalid, all open files on tree connections since they 86/* Mark as invalid, all open files on tree connections since they
86 were closed when session to server was lost */ 87 were closed when session to server was lost */
87static void mark_open_files_invalid(struct cifsTconInfo *pTcon) 88static void mark_open_files_invalid(struct cifs_tcon *pTcon)
88{ 89{
89 struct cifsFileInfo *open_file = NULL; 90 struct cifsFileInfo *open_file = NULL;
90 struct list_head *tmp; 91 struct list_head *tmp;
@@ -104,10 +105,10 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
104 105
105/* reconnect the socket, tcon, and smb session if needed */ 106/* reconnect the socket, tcon, and smb session if needed */
106static int 107static int
107cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command) 108cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
108{ 109{
109 int rc = 0; 110 int rc = 0;
110 struct cifsSesInfo *ses; 111 struct cifs_ses *ses;
111 struct TCP_Server_Info *server; 112 struct TCP_Server_Info *server;
112 struct nls_table *nls_codepage; 113 struct nls_table *nls_codepage;
113 114
@@ -226,7 +227,7 @@ out:
226 SMB information in the SMB header. If the return code is zero, this 227 SMB information in the SMB header. If the return code is zero, this
227 function must have filled in request_buf pointer */ 228 function must have filled in request_buf pointer */
228static int 229static int
229small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, 230small_smb_init(int smb_command, int wct, struct cifs_tcon *tcon,
230 void **request_buf) 231 void **request_buf)
231{ 232{
232 int rc; 233 int rc;
@@ -252,7 +253,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
252 253
253int 254int
254small_smb_init_no_tc(const int smb_command, const int wct, 255small_smb_init_no_tc(const int smb_command, const int wct,
255 struct cifsSesInfo *ses, void **request_buf) 256 struct cifs_ses *ses, void **request_buf)
256{ 257{
257 int rc; 258 int rc;
258 struct smb_hdr *buffer; 259 struct smb_hdr *buffer;
@@ -278,7 +279,7 @@ small_smb_init_no_tc(const int smb_command, const int wct,
278 279
279/* If the return code is zero, this function must fill in request_buf pointer */ 280/* If the return code is zero, this function must fill in request_buf pointer */
280static int 281static int
281__smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, 282__smb_init(int smb_command, int wct, struct cifs_tcon *tcon,
282 void **request_buf, void **response_buf) 283 void **request_buf, void **response_buf)
283{ 284{
284 *request_buf = cifs_buf_get(); 285 *request_buf = cifs_buf_get();
@@ -304,7 +305,7 @@ __smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
304 305
305/* If the return code is zero, this function must fill in request_buf pointer */ 306/* If the return code is zero, this function must fill in request_buf pointer */
306static int 307static int
307smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, 308smb_init(int smb_command, int wct, struct cifs_tcon *tcon,
308 void **request_buf, void **response_buf) 309 void **request_buf, void **response_buf)
309{ 310{
310 int rc; 311 int rc;
@@ -317,7 +318,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
317} 318}
318 319
319static int 320static int
320smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon, 321smb_init_no_reconnect(int smb_command, int wct, struct cifs_tcon *tcon,
321 void **request_buf, void **response_buf) 322 void **request_buf, void **response_buf)
322{ 323{
323 if (tcon->ses->need_reconnect || tcon->need_reconnect) 324 if (tcon->ses->need_reconnect || tcon->need_reconnect)
@@ -339,12 +340,13 @@ static int validate_t2(struct smb_t2_rsp *pSMB)
339 get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024) 340 get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024)
340 goto vt2_err; 341 goto vt2_err;
341 342
342 /* check that bcc is at least as big as parms + data */
343 /* check that bcc is less than negotiated smb buffer */
344 total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount); 343 total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount);
345 if (total_size >= 512) 344 if (total_size >= 512)
346 goto vt2_err; 345 goto vt2_err;
347 346
347 /* check that bcc is at least as big as parms + data, and that it is
348 * less than negotiated smb buffer
349 */
348 total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount); 350 total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount);
349 if (total_size > get_bcc(&pSMB->hdr) || 351 if (total_size > get_bcc(&pSMB->hdr) ||
350 total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) 352 total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)
@@ -357,8 +359,15 @@ vt2_err:
357 return -EINVAL; 359 return -EINVAL;
358} 360}
359 361
362static inline void inc_rfc1001_len(void *pSMB, int count)
363{
364 struct smb_hdr *hdr = (struct smb_hdr *)pSMB;
365
366 be32_add_cpu(&hdr->smb_buf_length, count);
367}
368
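With smb_buf_length now __be32, open-coded arithmetic on it would corrupt the wire length; this helper is the byte-order-safe replacement used throughout the rest of this patch (compare the CIFSSMBNegotiate and CIFSSMBEcho hunks below). Reading the total on-the-wire size then looks like this sketch:

	inc_rfc1001_len(pSMB, byte_count);			/* adjust payload */
	total = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;	/* + RFC1001 header */
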
360int 369int
361CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) 370CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
362{ 371{
363 NEGOTIATE_REQ *pSMB; 372 NEGOTIATE_REQ *pSMB;
364 NEGOTIATE_RSP *pSMBr; 373 NEGOTIATE_RSP *pSMBr;
@@ -409,7 +418,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
409 count += strlen(protocols[i].name) + 1; 418 count += strlen(protocols[i].name) + 1;
410 /* null at end of source and target buffers anyway */ 419 /* null at end of source and target buffers anyway */
411 } 420 }
412 pSMB->hdr.smb_buf_length += count; 421 inc_rfc1001_len(pSMB, count);
413 pSMB->ByteCount = cpu_to_le16(count); 422 pSMB->ByteCount = cpu_to_le16(count);
414 423
415 rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB, 424 rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
@@ -442,7 +451,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
442 rc = -EOPNOTSUPP; 451 rc = -EOPNOTSUPP;
443 goto neg_err_exit; 452 goto neg_err_exit;
444 } 453 }
445 server->secMode = (__u8)le16_to_cpu(rsp->SecurityMode); 454 server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode);
446 server->maxReq = le16_to_cpu(rsp->MaxMpxCount); 455 server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
447 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), 456 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
448 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 457 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
@@ -496,7 +505,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
496 cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) { 505 cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
497 memcpy(ses->server->cryptkey, rsp->EncryptionKey, 506 memcpy(ses->server->cryptkey, rsp->EncryptionKey,
498 CIFS_CRYPTO_KEY_SIZE); 507 CIFS_CRYPTO_KEY_SIZE);
499 } else if (server->secMode & SECMODE_PW_ENCRYPT) { 508 } else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
500 rc = -EIO; /* need cryptkey unless plain text */ 509 rc = -EIO; /* need cryptkey unless plain text */
501 goto neg_err_exit; 510 goto neg_err_exit;
502 } 511 }
@@ -518,11 +527,11 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
518 goto neg_err_exit; 527 goto neg_err_exit;
519 } 528 }
520 /* else wct == 17 NTLM */ 529 /* else wct == 17 NTLM */
521 server->secMode = pSMBr->SecurityMode; 530 server->sec_mode = pSMBr->SecurityMode;
522 if ((server->secMode & SECMODE_USER) == 0) 531 if ((server->sec_mode & SECMODE_USER) == 0)
523 cFYI(1, "share mode security"); 532 cFYI(1, "share mode security");
524 533
525 if ((server->secMode & SECMODE_PW_ENCRYPT) == 0) 534 if ((server->sec_mode & SECMODE_PW_ENCRYPT) == 0)
526#ifdef CONFIG_CIFS_WEAK_PW_HASH 535#ifdef CONFIG_CIFS_WEAK_PW_HASH
527 if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0) 536 if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
528#endif /* CIFS_WEAK_PW_HASH */ 537#endif /* CIFS_WEAK_PW_HASH */
@@ -541,10 +550,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
541 server->secType = RawNTLMSSP; 550 server->secType = RawNTLMSSP;
542 else if (secFlags & CIFSSEC_MAY_LANMAN) 551 else if (secFlags & CIFSSEC_MAY_LANMAN)
543 server->secType = LANMAN; 552 server->secType = LANMAN;
544/* #ifdef CONFIG_CIFS_EXPERIMENTAL
545 else if (secFlags & CIFSSEC_MAY_PLNTXT)
546 server->secType = ??
547#endif */
548 else { 553 else {
549 rc = -EOPNOTSUPP; 554 rc = -EOPNOTSUPP;
550 cERROR(1, "Invalid security type"); 555 cERROR(1, "Invalid security type");
@@ -566,19 +571,11 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
566 if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { 571 if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
567 memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey, 572 memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
568 CIFS_CRYPTO_KEY_SIZE); 573 CIFS_CRYPTO_KEY_SIZE);
569 } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) 574 } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC ||
570 && (pSMBr->EncryptionKeyLength == 0)) { 575 server->capabilities & CAP_EXTENDED_SECURITY) &&
576 (pSMBr->EncryptionKeyLength == 0)) {
571 /* decode security blob */ 577 /* decode security blob */
572 } else if (server->secMode & SECMODE_PW_ENCRYPT) { 578 count = get_bcc(&pSMBr->hdr);
573 rc = -EIO; /* no crypt key only if plain text pwd */
574 goto neg_err_exit;
575 }
576
577 /* BB might be helpful to save off the domain of server here */
578
579 if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) &&
580 (server->capabilities & CAP_EXTENDED_SECURITY)) {
581 count = pSMBr->ByteCount;
582 if (count < 16) { 579 if (count < 16) {
583 rc = -EIO; 580 rc = -EIO;
584 goto neg_err_exit; 581 goto neg_err_exit;
@@ -620,6 +617,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
620 } else 617 } else
621 rc = -EOPNOTSUPP; 618 rc = -EOPNOTSUPP;
622 } 619 }
620 } else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
621 rc = -EIO; /* no crypt key only if plain text pwd */
622 goto neg_err_exit;
623 } else 623 } else
624 server->capabilities &= ~CAP_EXTENDED_SECURITY; 624 server->capabilities &= ~CAP_EXTENDED_SECURITY;
625 625
@@ -630,27 +630,27 @@ signing_check:
630 /* MUST_SIGN already includes the MAY_SIGN FLAG 630 /* MUST_SIGN already includes the MAY_SIGN FLAG
631 so if this is zero it means that signing is disabled */ 631 so if this is zero it means that signing is disabled */
632 cFYI(1, "Signing disabled"); 632 cFYI(1, "Signing disabled");
633 if (server->secMode & SECMODE_SIGN_REQUIRED) { 633 if (server->sec_mode & SECMODE_SIGN_REQUIRED) {
634 cERROR(1, "Server requires " 634 cERROR(1, "Server requires "
635 "packet signing to be enabled in " 635 "packet signing to be enabled in "
636 "/proc/fs/cifs/SecurityFlags."); 636 "/proc/fs/cifs/SecurityFlags.");
637 rc = -EOPNOTSUPP; 637 rc = -EOPNOTSUPP;
638 } 638 }
639 server->secMode &= 639 server->sec_mode &=
640 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); 640 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
641 } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) { 641 } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
642 /* signing required */ 642 /* signing required */
643 cFYI(1, "Must sign - secFlags 0x%x", secFlags); 643 cFYI(1, "Must sign - secFlags 0x%x", secFlags);
644 if ((server->secMode & 644 if ((server->sec_mode &
645 (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) { 645 (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
646 cERROR(1, "signing required but server lacks support"); 646 cERROR(1, "signing required but server lacks support");
647 rc = -EOPNOTSUPP; 647 rc = -EOPNOTSUPP;
648 } else 648 } else
649 server->secMode |= SECMODE_SIGN_REQUIRED; 649 server->sec_mode |= SECMODE_SIGN_REQUIRED;
650 } else { 650 } else {
651 /* signing optional ie CIFSSEC_MAY_SIGN */ 651 /* signing optional ie CIFSSEC_MAY_SIGN */
652 if ((server->secMode & SECMODE_SIGN_REQUIRED) == 0) 652 if ((server->sec_mode & SECMODE_SIGN_REQUIRED) == 0)
653 server->secMode &= 653 server->sec_mode &=
654 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); 654 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
655 } 655 }
656 656
@@ -662,7 +662,7 @@ neg_err_exit:
 }
 
 int
-CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
+CIFSSMBTDis(const int xid, struct cifs_tcon *tcon)
 {
 	struct smb_hdr *smb_buffer;
 	int rc = 0;
@@ -721,6 +721,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
 {
 	ECHO_REQ *smb;
 	int rc = 0;
+	struct kvec iov;
 
 	cFYI(1, "In echo request");
 
@@ -732,12 +733,13 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
 	smb->hdr.Tid = 0xffff;
 	smb->hdr.WordCount = 1;
 	put_unaligned_le16(1, &smb->EchoCount);
-	put_bcc_le(1, &smb->hdr);
+	put_bcc(1, &smb->hdr);
 	smb->Data[0] = 'a';
-	smb->hdr.smb_buf_length += 3;
+	inc_rfc1001_len(smb, 3);
+	iov.iov_base = smb;
+	iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
 
-	rc = cifs_call_async(server, (struct smb_hdr *)smb,
-			     cifs_echo_callback, server);
+	rc = cifs_call_async(server, &iov, 1, cifs_echo_callback, server, true);
 	if (rc)
 		cFYI(1, "Echo request failed: %d", rc);
 
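Two conversions recur through the rest of this diff. `inc_rfc1001_len()` replaces every open-coded `pSMB->hdr.smb_buf_length += n`, because the RFC1001 length field is now kept in big-endian (wire) order inside the header; correspondingly, every `iov_len` computation gains a `be32_to_cpu()` before the 4-byte length prefix is added. A minimal sketch of what the helper presumably does (inferred from those conversions; the real definition lives in the cifs headers, not in this file):

	/* Sketch: smb_buf_length is stored big-endian after this series,
	 * so growing the frame means convert, add, convert back. */
	static inline void
	inc_rfc1001_len(void *buf, int count)
	{
		struct smb_hdr *hdr = (struct smb_hdr *)buf;

		hdr->smb_buf_length =
			cpu_to_be32(be32_to_cpu(hdr->smb_buf_length) + count);
	}

The CIFSSMBEcho hunk above also shows the new `cifs_call_async()` shape: it now takes a kvec array plus a vector count instead of a raw `smb_hdr` pointer.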
@@ -747,7 +749,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
 }
 
 int
-CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
+CIFSSMBLogoff(const int xid, struct cifs_ses *ses)
 {
 	LOGOFF_ANDX_REQ *pSMB;
 	int rc = 0;
@@ -774,7 +776,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 
 	pSMB->hdr.Mid = GetNextMid(ses->server);
 
-	if (ses->server->secMode &
+	if (ses->server->sec_mode &
 	   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
 		pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
 
@@ -794,7 +796,7 @@ session_already_dead:
 }
 
 int
-CIFSPOSIXDelFile(const int xid, struct cifsTconInfo *tcon, const char *fileName,
+CIFSPOSIXDelFile(const int xid, struct cifs_tcon *tcon, const char *fileName,
 		 __u16 type, const struct nls_table *nls_codepage, int remap)
 {
 	TRANSACTION2_SPI_REQ *pSMB = NULL;
@@ -852,7 +854,7 @@ PsxDelete:
 	pSMB->TotalParameterCount = pSMB->ParameterCount;
 	pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_UNLINK);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -869,7 +871,7 @@ PsxDelete:
 }
 
 int
-CIFSSMBDelFile(const int xid, struct cifsTconInfo *tcon, const char *fileName,
+CIFSSMBDelFile(const int xid, struct cifs_tcon *tcon, const char *fileName,
 	       const struct nls_table *nls_codepage, int remap)
 {
 	DELETE_FILE_REQ *pSMB = NULL;
@@ -898,7 +900,7 @@ DelFileRetry:
 	pSMB->SearchAttributes =
 	    cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM);
 	pSMB->BufferFormat = 0x04;
-	pSMB->hdr.smb_buf_length += name_len + 1;
+	inc_rfc1001_len(pSMB, name_len + 1);
 	pSMB->ByteCount = cpu_to_le16(name_len + 1);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -914,7 +916,7 @@ DelFileRetry:
 }
 
 int
-CIFSSMBRmDir(const int xid, struct cifsTconInfo *tcon, const char *dirName,
+CIFSSMBRmDir(const int xid, struct cifs_tcon *tcon, const char *dirName,
 	     const struct nls_table *nls_codepage, int remap)
 {
 	DELETE_DIRECTORY_REQ *pSMB = NULL;
@@ -942,7 +944,7 @@ RmDirRetry:
 	}
 
 	pSMB->BufferFormat = 0x04;
-	pSMB->hdr.smb_buf_length += name_len + 1;
+	inc_rfc1001_len(pSMB, name_len + 1);
 	pSMB->ByteCount = cpu_to_le16(name_len + 1);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -957,7 +959,7 @@ RmDirRetry:
 }
 
 int
-CIFSSMBMkDir(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBMkDir(const int xid, struct cifs_tcon *tcon,
 	     const char *name, const struct nls_table *nls_codepage, int remap)
 {
 	int rc = 0;
@@ -985,7 +987,7 @@ MkDirRetry:
 	}
 
 	pSMB->BufferFormat = 0x04;
-	pSMB->hdr.smb_buf_length += name_len + 1;
+	inc_rfc1001_len(pSMB, name_len + 1);
 	pSMB->ByteCount = cpu_to_le16(name_len + 1);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -1000,7 +1002,7 @@ MkDirRetry:
 }
 
 int
-CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
+CIFSPOSIXCreate(const int xid, struct cifs_tcon *tcon, __u32 posix_flags,
 		__u64 mode, __u16 *netfid, FILE_UNIX_BASIC_INFO *pRetData,
 		__u32 *pOplock, const char *name,
 		const struct nls_table *nls_codepage, int remap)
@@ -1063,7 +1065,7 @@ PsxCreat:
 	pSMB->TotalParameterCount = pSMB->ParameterCount;
 	pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_OPEN);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -1075,7 +1077,7 @@ PsxCreat:
 	cFYI(1, "copying inode info");
 	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
-	if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) {
+	if (rc || get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)) {
 		rc = -EIO;	/* bad smb */
 		goto psx_create_err;
 	}
@@ -1096,7 +1098,7 @@ PsxCreat:
 		pRetData->Type = cpu_to_le32(-1); /* unknown */
 		cFYI(DBG2, "unknown type");
 	} else {
-		if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP)
+		if (get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)
 				+ sizeof(FILE_UNIX_BASIC_INFO)) {
 			cERROR(1, "Open response data too small");
 			pRetData->Type = cpu_to_le32(-1);
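The other recurring conversion is `get_bcc()`, which replaces direct reads of `pSMBr->ByteCount` when validating responses. Previously SendReceive byteswapped ByteCount in place; after this series the field stays in little-endian wire order and is converted at each use. A plausible sketch of the helper (assumption: the BCC sits immediately after the WordCount words; the real helper also copes with unaligned access):

	/* Sketch: return the byte count (BCC) of a response in CPU order.
	 * The field itself stays little-endian in the received buffer. */
	static inline __u16
	get_bcc(struct smb_hdr *hdr)
	{
		__le16 *bc_ptr = (__le16 *)((char *)hdr + sizeof(*hdr) +
					    2 * hdr->WordCount);

		return le16_to_cpu(*bc_ptr);
	}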
@@ -1166,7 +1168,7 @@ access_flags_to_smbopen_mode(const int access_flags)
 }
 
 int
-SMBLegacyOpen(const int xid, struct cifsTconInfo *tcon,
+SMBLegacyOpen(const int xid, struct cifs_tcon *tcon,
 	      const char *fileName, const int openDisposition,
 	      const int access_flags, const int create_options, __u16 *netfid,
 	      int *pOplock, FILE_ALL_INFO *pfile_info,
@@ -1228,7 +1230,7 @@ OldOpenRetry:
 	pSMB->Sattr = cpu_to_le16(ATTR_HIDDEN | ATTR_SYSTEM | ATTR_DIRECTORY);
 	pSMB->OpenFunction = cpu_to_le16(convert_disposition(openDisposition));
 	count += name_len;
-	pSMB->hdr.smb_buf_length += count;
+	inc_rfc1001_len(pSMB, count);
 
 	pSMB->ByteCount = cpu_to_le16(count);
 	/* long_op set to 1 to allow for oplock break timeouts */
@@ -1273,7 +1275,7 @@ OldOpenRetry:
 }
 
 int
-CIFSSMBOpen(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBOpen(const int xid, struct cifs_tcon *tcon,
 	    const char *fileName, const int openDisposition,
 	    const int access_flags, const int create_options, __u16 *netfid,
 	    int *pOplock, FILE_ALL_INFO *pfile_info,
@@ -1341,7 +1343,7 @@ openRetry:
 			SECURITY_CONTEXT_TRACKING | SECURITY_EFFECTIVE_ONLY;
 
 	count += name_len;
-	pSMB->hdr.smb_buf_length += count;
+	inc_rfc1001_len(pSMB, count);
 
 	pSMB->ByteCount = cpu_to_le16(count);
 	/* long_op set to 1 to allow for oplock break timeouts */
@@ -1375,8 +1377,7 @@ openRetry:
 }
 
 int
-CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
-	    const unsigned int count, const __u64 lseek, unsigned int *nbytes,
+CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes,
 	    char **buf, int *pbuf_type)
 {
 	int rc = -EACCES;
@@ -1386,13 +1387,18 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 	int wct;
 	int resp_buf_type = 0;
 	struct kvec iov[1];
+	__u32 pid = io_parms->pid;
+	__u16 netfid = io_parms->netfid;
+	__u64 offset = io_parms->offset;
+	struct cifs_tcon *tcon = io_parms->tcon;
+	unsigned int count = io_parms->length;
 
 	cFYI(1, "Reading %d bytes on fid %d", count, netfid);
 	if (tcon->ses->capabilities & CAP_LARGE_FILES)
 		wct = 12;
 	else {
 		wct = 10; /* old style read */
-		if ((lseek >> 32) > 0)  {
+		if ((offset >> 32) > 0)  {
 			/* can not handle this big offset for old */
 			return -EIO;
 		}
@@ -1403,15 +1409,18 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 	if (rc)
 		return rc;
 
+	pSMB->hdr.Pid = cpu_to_le16((__u16)pid);
+	pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16));
+
 	/* tcon and ses pointer are checked in smb_init */
 	if (tcon->ses->server == NULL)
 		return -ECONNABORTED;
 
 	pSMB->AndXCommand = 0xFF;	/* none */
 	pSMB->Fid = netfid;
-	pSMB->OffsetLow = cpu_to_le32(lseek & 0xFFFFFFFF);
+	pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF);
 	if (wct == 12)
-		pSMB->OffsetHigh = cpu_to_le32(lseek >> 32);
+		pSMB->OffsetHigh = cpu_to_le32(offset >> 32);
 
 	pSMB->Remaining = 0;
 	pSMB->MaxCount = cpu_to_le16(count & 0xFFFF);
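These read/write hunks also start stamping the originating process id into the SMB header, carried in from the new `cifs_io_parms` bundle (presumably so byte-range lock ownership is attributed to the right process rather than to the thread doing the I/O). The split across the two 16-bit header fields, restated as a hypothetical helper (no such helper exists in the patch, which does this inline):

	/* Hypothetical helper: low 16 bits -> Pid, high 16 bits -> PidHigh */
	static inline void
	set_smb_pid(struct smb_hdr *hdr, __u32 pid)
	{
		hdr->Pid = cpu_to_le16((__u16)pid);
		hdr->PidHigh = cpu_to_le16((__u16)(pid >> 16));
	}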
@@ -1426,7 +1435,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 	}
 
 	iov[0].iov_base = (char *)pSMB;
-	iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
+	iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;
 	rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
 			 &resp_buf_type, CIFS_LOG_ERROR);
 	cifs_stats_inc(&tcon->num_reads);
@@ -1480,9 +1489,8 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 
 
 int
-CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
-	     const int netfid, const unsigned int count,
-	     const __u64 offset, unsigned int *nbytes, const char *buf,
+CIFSSMBWrite(const int xid, struct cifs_io_parms *io_parms,
+	     unsigned int *nbytes, const char *buf,
 	     const char __user *ubuf, const int long_op)
 {
 	int rc = -EACCES;
@@ -1491,6 +1499,11 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned, wct;
 	__u32 bytes_sent;
 	__u16 byte_count;
+	__u32 pid = io_parms->pid;
+	__u16 netfid = io_parms->netfid;
+	__u64 offset = io_parms->offset;
+	struct cifs_tcon *tcon = io_parms->tcon;
+	unsigned int count = io_parms->length;
 
 	*nbytes = 0;
 
@@ -1512,6 +1525,10 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 		      (void **) &pSMBr);
 	if (rc)
 		return rc;
+
+	pSMB->hdr.Pid = cpu_to_le16((__u16)pid);
+	pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16));
+
 	/* tcon and ses pointer are checked in smb_init */
 	if (tcon->ses->server == NULL)
 		return -ECONNABORTED;
@@ -1560,7 +1577,7 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 
 	pSMB->DataLengthLow = cpu_to_le16(bytes_sent & 0xFFFF);
 	pSMB->DataLengthHigh = cpu_to_le16(bytes_sent >> 16);
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 
 	if (wct == 14)
 		pSMB->ByteCount = cpu_to_le16(byte_count);
@@ -1598,17 +1615,259 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 	return rc;
 }
 
+void
+cifs_writedata_release(struct kref *refcount)
+{
+	struct cifs_writedata *wdata = container_of(refcount,
+					struct cifs_writedata, refcount);
+
+	if (wdata->cfile)
+		cifsFileInfo_put(wdata->cfile);
+
+	kfree(wdata);
+}
+
+/*
+ * Write failed with a retryable error. Resend the write request. It's also
+ * possible that the page was redirtied so re-clean the page.
+ */
+static void
+cifs_writev_requeue(struct cifs_writedata *wdata)
+{
+	int i, rc;
+	struct inode *inode = wdata->cfile->dentry->d_inode;
+
+	for (i = 0; i < wdata->nr_pages; i++) {
+		lock_page(wdata->pages[i]);
+		clear_page_dirty_for_io(wdata->pages[i]);
+	}
+
+	do {
+		rc = cifs_async_writev(wdata);
+	} while (rc == -EAGAIN);
+
+	for (i = 0; i < wdata->nr_pages; i++) {
+		if (rc != 0)
+			SetPageError(wdata->pages[i]);
+		unlock_page(wdata->pages[i]);
+	}
+
+	mapping_set_error(inode->i_mapping, rc);
+	kref_put(&wdata->refcount, cifs_writedata_release);
+}
+
+static void
+cifs_writev_complete(struct work_struct *work)
+{
+	struct cifs_writedata *wdata = container_of(work,
+						struct cifs_writedata, work);
+	struct inode *inode = wdata->cfile->dentry->d_inode;
+	int i = 0;
+
+	if (wdata->result == 0) {
+		cifs_update_eof(CIFS_I(inode), wdata->offset, wdata->bytes);
+		cifs_stats_bytes_written(tlink_tcon(wdata->cfile->tlink),
+					 wdata->bytes);
+	} else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN)
+		return cifs_writev_requeue(wdata);
+
+	for (i = 0; i < wdata->nr_pages; i++) {
+		struct page *page = wdata->pages[i];
+		if (wdata->result == -EAGAIN)
+			__set_page_dirty_nobuffers(page);
+		else if (wdata->result < 0)
+			SetPageError(page);
+		end_page_writeback(page);
+		page_cache_release(page);
+	}
+	if (wdata->result != -EAGAIN)
+		mapping_set_error(inode->i_mapping, wdata->result);
+	kref_put(&wdata->refcount, cifs_writedata_release);
+}
+
+struct cifs_writedata *
+cifs_writedata_alloc(unsigned int nr_pages)
+{
+	struct cifs_writedata *wdata;
+
+	/* this would overflow */
+	if (nr_pages == 0) {
+		cERROR(1, "%s: called with nr_pages == 0!", __func__);
+		return NULL;
+	}
+
+	/* writedata + number of page pointers */
+	wdata = kzalloc(sizeof(*wdata) +
+			sizeof(struct page *) * (nr_pages - 1), GFP_NOFS);
+	if (wdata != NULL) {
+		INIT_WORK(&wdata->work, cifs_writev_complete);
+		kref_init(&wdata->refcount);
+	}
+	return wdata;
+}
+
+/*
+ * Check the midState and signature on received buffer (if any), and queue the
+ * workqueue completion task.
+ */
+static void
+cifs_writev_callback(struct mid_q_entry *mid)
+{
+	struct cifs_writedata *wdata = mid->callback_data;
+	struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+	unsigned int written;
+	WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
+
+	switch (mid->midState) {
+	case MID_RESPONSE_RECEIVED:
+		wdata->result = cifs_check_receive(mid, tcon->ses->server, 0);
+		if (wdata->result != 0)
+			break;
+
+		written = le16_to_cpu(smb->CountHigh);
+		written <<= 16;
+		written += le16_to_cpu(smb->Count);
+		/*
+		 * Mask off high 16 bits when bytes written as returned
+		 * by the server is greater than bytes requested by the
+		 * client. OS/2 servers are known to set incorrect
+		 * CountHigh values.
+		 */
+		if (written > wdata->bytes)
+			written &= 0xFFFF;
+
+		if (written < wdata->bytes)
+			wdata->result = -ENOSPC;
+		else
+			wdata->bytes = written;
+		break;
+	case MID_REQUEST_SUBMITTED:
+	case MID_RETRY_NEEDED:
+		wdata->result = -EAGAIN;
+		break;
+	default:
+		wdata->result = -EIO;
+		break;
+	}
+
+	queue_work(system_nrt_wq, &wdata->work);
+	DeleteMidQEntry(mid);
+	atomic_dec(&tcon->ses->server->inFlight);
+	wake_up(&tcon->ses->server->request_q);
+}
+
+/* cifs_async_writev - send an async write, and set up mid to handle result */
+int
+cifs_async_writev(struct cifs_writedata *wdata)
+{
+	int i, rc = -EACCES;
+	WRITE_REQ *smb = NULL;
+	int wct;
+	struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+	struct inode *inode = wdata->cfile->dentry->d_inode;
+	struct kvec *iov = NULL;
+
+	if (tcon->ses->capabilities & CAP_LARGE_FILES) {
+		wct = 14;
+	} else {
+		wct = 12;
+		if (wdata->offset >> 32 > 0) {
+			/* can not handle big offset for old srv */
+			return -EIO;
+		}
+	}
+
+	rc = small_smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **)&smb);
+	if (rc)
+		goto async_writev_out;
+
+	/* 1 iov per page + 1 for header */
+	iov = kzalloc((wdata->nr_pages + 1) * sizeof(*iov), GFP_NOFS);
+	if (iov == NULL) {
+		rc = -ENOMEM;
+		goto async_writev_out;
+	}
+
+	smb->hdr.Pid = cpu_to_le16((__u16)wdata->cfile->pid);
+	smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->cfile->pid >> 16));
+
+	smb->AndXCommand = 0xFF;	/* none */
+	smb->Fid = wdata->cfile->netfid;
+	smb->OffsetLow = cpu_to_le32(wdata->offset & 0xFFFFFFFF);
+	if (wct == 14)
+		smb->OffsetHigh = cpu_to_le32(wdata->offset >> 32);
+	smb->Reserved = 0xFFFFFFFF;
+	smb->WriteMode = 0;
+	smb->Remaining = 0;
+
+	smb->DataOffset =
+	    cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4);
+
+	/* 4 for RFC1001 length + 1 for BCC */
+	iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4 + 1;
+	iov[0].iov_base = smb;
+
+	/* marshal up the pages into iov array */
+	wdata->bytes = 0;
+	for (i = 0; i < wdata->nr_pages; i++) {
+		iov[i + 1].iov_len = min(inode->i_size -
+				      page_offset(wdata->pages[i]),
+					(loff_t)PAGE_CACHE_SIZE);
+		iov[i + 1].iov_base = kmap(wdata->pages[i]);
+		wdata->bytes += iov[i + 1].iov_len;
+	}
+
+	cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes);
+
+	smb->DataLengthLow = cpu_to_le16(wdata->bytes & 0xFFFF);
+	smb->DataLengthHigh = cpu_to_le16(wdata->bytes >> 16);
+
+	if (wct == 14) {
+		inc_rfc1001_len(&smb->hdr, wdata->bytes + 1);
+		put_bcc(wdata->bytes + 1, &smb->hdr);
+	} else {
+		/* wct == 12 */
+		struct smb_com_writex_req *smbw =
+				(struct smb_com_writex_req *)smb;
+		inc_rfc1001_len(&smbw->hdr, wdata->bytes + 5);
+		put_bcc(wdata->bytes + 5, &smbw->hdr);
+		iov[0].iov_len += 4; /* pad bigger by four bytes */
+	}
+
+	kref_get(&wdata->refcount);
+	rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1,
+			     cifs_writev_callback, wdata, false);
+
+	if (rc == 0)
+		cifs_stats_inc(&tcon->num_writes);
+	else
+		kref_put(&wdata->refcount, cifs_writedata_release);
+
+	/* send is done, unmap pages */
+	for (i = 0; i < wdata->nr_pages; i++)
+		kunmap(wdata->pages[i]);
+
+async_writev_out:
+	cifs_small_buf_release(smb);
+	kfree(iov);
+	return rc;
+}
+
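The block above lands the whole asynchronous writepages path in one piece: a refcounted `cifs_writedata` owns the pages in flight, `cifs_writev_callback()` runs in the demultiplex thread and only classifies the result, and page cleanup is deferred to `cifs_writev_complete()` on a workqueue. The intended lifecycle, pieced together from these functions (the real caller lives in fs/cifs/file.c and is not part of this hunk; `open_file`, `wbc`, `nr_pages` and `rc` below are assumed caller-side variables):

	/* Hedged usage sketch, not the actual writepages code */
	struct cifs_writedata *wdata = cifs_writedata_alloc(nr_pages);

	if (wdata) {
		/* caller fills wdata->pages[], offset, etc. (elided) */
		cifsFileInfo_get(open_file);	/* ref dropped in release */
		wdata->cfile = open_file;
		wdata->sync_mode = wbc->sync_mode;
		wdata->nr_pages = nr_pages;
		rc = cifs_async_writev(wdata);	/* takes its own reference */
		kref_put(&wdata->refcount, cifs_writedata_release);
	}

Note the reference counting: `cifs_async_writev()` grabs a reference for the callback before submitting and drops it itself on submission failure, so the caller can unconditionally put its own reference afterwards.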
 int
-CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
-	     const int netfid, const unsigned int count,
-	     const __u64 offset, unsigned int *nbytes, struct kvec *iov,
-	     int n_vec, const int long_op)
+CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms,
+	     unsigned int *nbytes, struct kvec *iov, int n_vec,
+	     const int long_op)
 {
 	int rc = -EACCES;
 	WRITE_REQ *pSMB = NULL;
 	int wct;
 	int smb_hdr_len;
 	int resp_buf_type = 0;
+	__u32 pid = io_parms->pid;
+	__u16 netfid = io_parms->netfid;
+	__u64 offset = io_parms->offset;
+	struct cifs_tcon *tcon = io_parms->tcon;
+	unsigned int count = io_parms->length;
 
 	*nbytes = 0;
 
@@ -1626,6 +1885,10 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
 	rc = small_smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **) &pSMB);
 	if (rc)
 		return rc;
+
+	pSMB->hdr.Pid = cpu_to_le16((__u16)pid);
+	pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16));
+
 	/* tcon and ses pointer are checked in smb_init */
 	if (tcon->ses->server == NULL)
 		return -ECONNABORTED;
@@ -1644,11 +1907,12 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
 
 	pSMB->DataLengthLow = cpu_to_le16(count & 0xFFFF);
 	pSMB->DataLengthHigh = cpu_to_le16(count >> 16);
-	smb_hdr_len = pSMB->hdr.smb_buf_length + 1; /* hdr + 1 byte pad */
+	/* header + 1 byte pad */
+	smb_hdr_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 1;
 	if (wct == 14)
-		pSMB->hdr.smb_buf_length += count+1;
+		inc_rfc1001_len(pSMB, count + 1);
 	else /* wct == 12 */
-		pSMB->hdr.smb_buf_length += count+5; /* smb data starts later */
+		inc_rfc1001_len(pSMB, count + 5); /* smb data starts later */
 	if (wct == 14)
 		pSMB->ByteCount = cpu_to_le16(count + 1);
 	else /* wct == 12 */ /* bigger pad, smaller smb hdr, keep offset ok */ {
@@ -1700,7 +1964,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
 
 
 int
-CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
 	    const __u16 smb_file_id, const __u64 len,
 	    const __u64 offset, const __u32 numUnlock,
 	    const __u32 numLock, const __u8 lockType,
@@ -1748,7 +2012,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 		/* oplock break */
 		count = 0;
 	}
-	pSMB->hdr.smb_buf_length += count;
+	inc_rfc1001_len(pSMB, count);
 	pSMB->ByteCount = cpu_to_le16(count);
 
 	if (waitFlag) {
@@ -1770,7 +2034,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 }
 
 int
-CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon,
 		const __u16 smb_file_id, const int get_flag, const __u64 len,
 		struct file_lock *pLockData, const __u16 lock_type,
 		const bool waitFlag)
@@ -1839,14 +2103,14 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 	pSMB->Fid = smb_file_id;
 	pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_LOCK);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	if (waitFlag) {
 		rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB,
 			(struct smb_hdr *) pSMBr, &bytes_returned);
 	} else {
 		iov[0].iov_base = (char *)pSMB;
-		iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
+		iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;
 		rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
 				&resp_buf_type, timeout);
 		pSMB = NULL; /* request buf already freed by SendReceive2. Do
@@ -1862,7 +2126,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 		__u16 data_count;
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
-		if (rc || (pSMBr->ByteCount < sizeof(struct cifs_posix_lock))) {
+		if (rc || get_bcc(&pSMBr->hdr) < sizeof(*parm_data)) {
 			rc = -EIO;	/* bad smb */
 			goto plk_err_exit;
 		}
@@ -1908,7 +2172,7 @@ plk_err_exit:
 
 
 int
-CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
+CIFSSMBClose(const int xid, struct cifs_tcon *tcon, int smb_file_id)
 {
 	int rc = 0;
 	CLOSE_REQ *pSMB = NULL;
@@ -1941,7 +2205,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 }
 
 int
-CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
+CIFSSMBFlush(const int xid, struct cifs_tcon *tcon, int smb_file_id)
 {
 	int rc = 0;
 	FLUSH_REQ *pSMB = NULL;
@@ -1962,7 +2226,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 }
 
 int
-CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBRename(const int xid, struct cifs_tcon *tcon,
 	      const char *fromName, const char *toName,
 	      const struct nls_table *nls_codepage, int remap)
 {
@@ -2012,7 +2276,7 @@ renameRetry:
 	}
 
 	count = 1 /* 1st signature byte */  + name_len + name_len2;
-	pSMB->hdr.smb_buf_length += count;
+	inc_rfc1001_len(pSMB, count);
 	pSMB->ByteCount = cpu_to_le16(count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2029,7 +2293,7 @@ renameRetry:
 	return rc;
 }
 
-int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
+int CIFSSMBRenameOpenFile(const int xid, struct cifs_tcon *pTcon,
 		int netfid, const char *target_name,
 		const struct nls_table *nls_codepage, int remap)
 {
@@ -2092,7 +2356,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
 	pSMB->InformationLevel =
 		cpu_to_le16(SMB_SET_FILE_RENAME_INFORMATION);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceive(xid, pTcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -2109,7 +2373,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
 }
 
 int
-CIFSSMBCopy(const int xid, struct cifsTconInfo *tcon, const char *fromName,
+CIFSSMBCopy(const int xid, struct cifs_tcon *tcon, const char *fromName,
 	    const __u16 target_tid, const char *toName, const int flags,
 	    const struct nls_table *nls_codepage, int remap)
 {
@@ -2159,7 +2423,7 @@ copyRetry:
 	}
 
 	count = 1 /* 1st signature byte */  + name_len + name_len2;
-	pSMB->hdr.smb_buf_length += count;
+	inc_rfc1001_len(pSMB, count);
 	pSMB->ByteCount = cpu_to_le16(count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2177,7 +2441,7 @@ copyRetry:
 }
 
 int
-CIFSUnixCreateSymLink(const int xid, struct cifsTconInfo *tcon,
+CIFSUnixCreateSymLink(const int xid, struct cifs_tcon *tcon,
 		      const char *fromName, const char *toName,
 		      const struct nls_table *nls_codepage)
 {
@@ -2249,7 +2513,7 @@ createSymLinkRetry:
 	pSMB->DataOffset = cpu_to_le16(offset);
 	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_LINK);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -2266,7 +2530,7 @@ createSymLinkRetry:
 }
 
 int
-CIFSUnixCreateHardLink(const int xid, struct cifsTconInfo *tcon,
+CIFSUnixCreateHardLink(const int xid, struct cifs_tcon *tcon,
 		       const char *fromName, const char *toName,
 		       const struct nls_table *nls_codepage, int remap)
 {
@@ -2335,7 +2599,7 @@ createHardLinkRetry:
 	pSMB->DataOffset = cpu_to_le16(offset);
 	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_HLINK);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -2351,7 +2615,7 @@ createHardLinkRetry:
 }
 
 int
-CIFSCreateHardLink(const int xid, struct cifsTconInfo *tcon,
+CIFSCreateHardLink(const int xid, struct cifs_tcon *tcon,
 		   const char *fromName, const char *toName,
 		   const struct nls_table *nls_codepage, int remap)
 {
@@ -2406,7 +2670,7 @@ winCreateHardLinkRetry:
 	}
 
 	count = 1 /* string type byte */  + name_len + name_len2;
-	pSMB->hdr.smb_buf_length += count;
+	inc_rfc1001_len(pSMB, count);
 	pSMB->ByteCount = cpu_to_le16(count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2423,7 +2687,7 @@ winCreateHardLinkRetry:
 }
 
 int
-CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBUnixQuerySymLink(const int xid, struct cifs_tcon *tcon,
 			const unsigned char *searchName, char **symlinkinfo,
 			const struct nls_table *nls_codepage)
 {
@@ -2477,7 +2741,7 @@ querySymLinkRetry:
 	pSMB->ParameterCount = pSMB->TotalParameterCount;
 	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_LINK);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2489,7 +2753,7 @@ querySymLinkRetry:
 
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 		/* BB also check enough total bytes returned */
-		if (rc || (pSMBr->ByteCount < 2))
+		if (rc || get_bcc(&pSMBr->hdr) < 2)
 			rc = -EIO;
 		else {
 			bool is_unicode;
@@ -2516,9 +2780,19 @@ querySymLinkRetry:
 	return rc;
 }
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL
+/*
+ * Recent Windows versions now create symlinks more frequently
+ * and they use the "reparse point" mechanism below. We can of course
+ * do symlinks nicely to Samba and other servers which support the
+ * CIFS Unix Extensions and we can also do SFU symlinks and "client only"
+ * "MF" symlinks optionally, but for recent Windows we really need to
+ * reenable the code below and fix the cifs_symlink callers to handle this.
+ * In the interim this code has been moved to its own config option so
+ * it is not compiled in by default until callers fixed up and more tested.
+ */
 int
-CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBQueryReparseLinkInfo(const int xid, struct cifs_tcon *tcon,
 			const unsigned char *searchName,
 			char *symlinkinfo, const int buflen, __u16 fid,
 			const struct nls_table *nls_codepage)
@@ -2561,14 +2835,14 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
 	} else {		/* decode response */
 		__u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
 		__u32 data_count = le32_to_cpu(pSMBr->DataCount);
-		if ((pSMBr->ByteCount < 2) || (data_offset > 512)) {
+		if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) {
			/* BB also check enough total bytes returned */
			rc = -EIO;	/* bad smb */
			goto qreparse_out;
		}
		if (data_count && (data_count < 2048)) {
			char *end_of_smb = 2 /* sizeof byte count */ +
-				pSMBr->ByteCount + (char *)&pSMBr->ByteCount;
+				get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
 
			struct reparse_data *reparse_buf =
						(struct reparse_data *)
@@ -2618,7 +2892,7 @@ qreparse_out:
 
 	return rc;
 }
-#endif /* CIFS_EXPERIMENTAL */
+#endif /* CIFS_SYMLINK_EXPERIMENTAL */ /* BB temporarily unused */
 
 #ifdef CONFIG_CIFS_POSIX
 
@@ -2756,7 +3030,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
 }
 
 int
-CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon,
		   const unsigned char *searchName,
		   char *acl_inf, const int buflen, const int acl_type,
		   const struct nls_table *nls_codepage, int remap)
@@ -2814,7 +3088,7 @@ queryAclRetry:
 	pSMB->ParameterCount = pSMB->TotalParameterCount;
 	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_ACL);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2826,8 +3100,8 @@ queryAclRetry:
 		/* decode response */
 
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
-		if (rc || (pSMBr->ByteCount < 2))
 		/* BB also check enough total bytes returned */
+		if (rc || get_bcc(&pSMBr->hdr) < 2)
			rc = -EIO;	/* bad smb */
		else {
			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -2844,7 +3118,7 @@ queryAclRetry:
 }
 
 int
-CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBSetPosixACL(const int xid, struct cifs_tcon *tcon,
		   const unsigned char *fileName,
		   const char *local_acl, const int buflen,
		   const int acl_type,
@@ -2908,7 +3182,7 @@ setAclRetry:
 	pSMB->ParameterCount = cpu_to_le16(params);
 	pSMB->TotalParameterCount = pSMB->ParameterCount;
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -2924,7 +3198,7 @@ setACLerrorExit:
 
 /* BB fix tabs in this function FIXME BB */
 int
-CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
+CIFSGetExtAttr(const int xid, struct cifs_tcon *tcon,
	       const int netfid, __u64 *pExtAttrBits, __u64 *pMask)
 {
	int rc = 0;
@@ -2966,7 +3240,7 @@ GetExtAttrRetry:
	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_ATTR_FLAGS);
	pSMB->Pad = 0;
	pSMB->Fid = netfid;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
	pSMB->t2.ByteCount = cpu_to_le16(byte_count);

	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2976,8 +3250,8 @@ GetExtAttrRetry:
	} else {
		/* decode response */
		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
-		if (rc || (pSMBr->ByteCount < 2))
		/* BB also check enough total bytes returned */
+		if (rc || get_bcc(&pSMBr->hdr) < 2)
			/* If rc should we check for EOPNOSUPP and
			   disable the srvino flag? or in caller? */
			rc = -EIO; /* bad smb */
@@ -3017,7 +3291,7 @@ GetExtAttrOut:
  */
 static int
 smb_init_nttransact(const __u16 sub_command, const int setup_count,
-		   const int parm_len, struct cifsTconInfo *tcon,
+		   const int parm_len, struct cifs_tcon *tcon,
		   void **ret_buf)
 {
	int rc;
@@ -3052,6 +3326,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
	char *end_of_smb;
	__u32 data_count, data_offset, parm_count, parm_offset;
	struct smb_com_ntransact_rsp *pSMBr;
+	u16 bcc;

	*pdatalen = 0;
	*pparmlen = 0;
@@ -3061,8 +3336,8 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,

	pSMBr = (struct smb_com_ntransact_rsp *)buf;

-	/* ByteCount was converted from little endian in SendReceive */
-	end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
+	bcc = get_bcc(&pSMBr->hdr);
+	end_of_smb = 2 /* sizeof byte count */ + bcc +
			(char *)&pSMBr->ByteCount;

	data_offset = le32_to_cpu(pSMBr->DataOffset);
@@ -3088,7 +3363,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
		       *ppdata, data_count, (data_count + *ppdata),
		       end_of_smb, pSMBr);
		return -EINVAL;
-	} else if (parm_count + data_count > pSMBr->ByteCount) {
+	} else if (parm_count + data_count > bcc) {
		cFYI(1, "parm count and data count larger than SMB");
		return -EINVAL;
	}
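For reference, the bound enforced in `validate_ntransact()` above: a response ends 2 bytes (the ByteCount field itself) plus `bcc` data bytes past the address of `ByteCount`, and the parameter and data regions the server advertises must both land inside that window. Restated as standalone arithmetic (illustration only, mirroring the code above):

	/* where the SMB ends, relative to the ByteCount field */
	char *end_of_smb = (char *)&pSMBr->ByteCount + 2 + bcc;

	/* both regions live inside the bcc area, so beyond the per-region
	 * end_of_smb checks, their combined size can never exceed bcc */
	if (parm_count + data_count > bcc)
		return -EINVAL;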
@@ -3099,7 +3374,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
 
 /* Get Security Descriptor (by handle) from remote server for a file or dir */
 int
-CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
+CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid,
		  struct cifs_ntsd **acl_inf, __u32 *pbuflen)
 {
	int rc = 0;
@@ -3124,9 +3399,9 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
	pSMB->AclFlags = cpu_to_le32(CIFS_ACL_OWNER | CIFS_ACL_GROUP |
				     CIFS_ACL_DACL);
	pSMB->ByteCount = cpu_to_le16(11); /* 3 bytes pad + 8 bytes parm */
-	pSMB->hdr.smb_buf_length += 11;
+	inc_rfc1001_len(pSMB, 11);
	iov[0].iov_base = (char *)pSMB;
-	iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
+	iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;

	rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type,
			 0);
@@ -3191,7 +3466,7 @@ qsec_out:
 }
 
 int
-CIFSSMBSetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
+CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid,
			struct cifs_ntsd *pntsd, __u32 acllen)
 {
	__u16 byte_count, param_count, data_count, param_offset, data_offset;
@@ -3235,10 +3510,9 @@ setCifsAclRetry:
	memcpy((char *) &pSMBr->hdr.Protocol + data_offset,
		(char *) pntsd,
		acllen);
-	pSMB->hdr.smb_buf_length += (byte_count + data_count);
-
+	inc_rfc1001_len(pSMB, byte_count + data_count);
	} else
-		pSMB->hdr.smb_buf_length += byte_count;
+		inc_rfc1001_len(pSMB, byte_count);

	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -3258,7 +3532,7 @@ setCifsAclRetry:
 
 /* Legacy Query Path Information call for lookup to old servers such
    as Win9x/WinME */
-int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
+int SMBQueryInformation(const int xid, struct cifs_tcon *tcon,
			const unsigned char *searchName,
			FILE_ALL_INFO *pFinfo,
			const struct nls_table *nls_codepage, int remap)
@@ -3289,7 +3563,7 @@ QInfRetry:
	}
	pSMB->BufferFormat = 0x04;
	name_len++; /* account for buffer type byte */
-	pSMB->hdr.smb_buf_length += (__u16) name_len;
+	inc_rfc1001_len(pSMB, (__u16)name_len);
	pSMB->ByteCount = cpu_to_le16(name_len);

	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -3326,7 +3600,7 @@ QInfRetry:
 }
 
 int
-CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBQFileInfo(const int xid, struct cifs_tcon *tcon,
		 u16 netfid, FILE_ALL_INFO *pFindData)
 {
	struct smb_t2_qfi_req *pSMB = NULL;
@@ -3364,7 +3638,7 @@ QFileInfoRetry:
	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
	pSMB->Pad = 0;
	pSMB->Fid = netfid;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);

	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -3375,7 +3649,7 @@ QFileInfoRetry:

	if (rc) /* BB add auto retry on EOPNOTSUPP? */
		rc = -EIO;
-	else if (pSMBr->ByteCount < 40)
+	else if (get_bcc(&pSMBr->hdr) < 40)
		rc = -EIO;	/* bad smb */
	else if (pFindData) {
		__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3393,7 +3667,7 @@ QFileInfoRetry:
 }
 
 int
-CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBQPathInfo(const int xid, struct cifs_tcon *tcon,
		 const unsigned char *searchName,
		 FILE_ALL_INFO *pFindData,
		 int legacy /* old style infolevel */,
@@ -3451,7 +3725,7 @@ QPathInfoRetry:
	else
		pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
	pSMB->ByteCount = cpu_to_le16(byte_count);

	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -3463,9 +3737,9 @@ QPathInfoRetry:

	if (rc) /* BB add auto retry on EOPNOTSUPP? */
		rc = -EIO;
-	else if (!legacy && (pSMBr->ByteCount < 40))
+	else if (!legacy && get_bcc(&pSMBr->hdr) < 40)
		rc = -EIO;	/* bad smb */
-	else if (legacy && (pSMBr->ByteCount < 24))
+	else if (legacy && get_bcc(&pSMBr->hdr) < 24)
		rc = -EIO;  /* 24 or 26 expected but we do not read
				last field */
	else if (pFindData) {
@@ -3494,7 +3768,7 @@ QPathInfoRetry:
 }
 
 int
-CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBUnixQFileInfo(const int xid, struct cifs_tcon *tcon,
		     u16 netfid, FILE_UNIX_BASIC_INFO *pFindData)
 {
	struct smb_t2_qfi_req *pSMB = NULL;
@@ -3532,7 +3806,7 @@ UnixQFileInfoRetry:
	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
	pSMB->Pad = 0;
	pSMB->Fid = netfid;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);

	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -3541,7 +3815,7 @@ UnixQFileInfoRetry:
	} else {		/* decode response */
		rc = validate_t2((struct smb_t2_rsp *)pSMBr);

-		if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
+		if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) {
			cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
				   "Unix Extensions can be disabled on mount "
				   "by specifying the nosfu mount option.");
@@ -3563,7 +3837,7 @@ UnixQFileInfoRetry:
 }
 
 int
-CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBUnixQPathInfo(const int xid, struct cifs_tcon *tcon,
		     const unsigned char *searchName,
		     FILE_UNIX_BASIC_INFO *pFindData,
		     const struct nls_table *nls_codepage, int remap)
@@ -3617,7 +3891,7 @@ UnixQPathInfoRetry:
	pSMB->ParameterCount = pSMB->TotalParameterCount;
	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
	pSMB->ByteCount = cpu_to_le16(byte_count);

	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -3627,7 +3901,7 @@ UnixQPathInfoRetry:
	} else {		/* decode response */
		rc = validate_t2((struct smb_t2_rsp *)pSMBr);

-		if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
+		if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) {
			cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
				   "Unix Extensions can be disabled on mount "
				   "by specifying the nosfu mount option.");
@@ -3649,7 +3923,7 @@ UnixQPathInfoRetry:
 
 /* xid, tcon, searchName and codepage are input parms, rest are returned */
 int
-CIFSFindFirst(const int xid, struct cifsTconInfo *tcon,
+CIFSFindFirst(const int xid, struct cifs_tcon *tcon,
	      const char *searchName,
	      const struct nls_table *nls_codepage,
	      __u16 *pnetfid,
@@ -3731,7 +4005,7 @@ findFirstRetry:

	/* BB what should we set StorageType to? Does it matter? BB */
	pSMB->SearchStorageType = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
	pSMB->ByteCount = cpu_to_le16(byte_count);

	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -3797,7 +4071,7 @@ findFirstRetry:
	return rc;
 }
 
-int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
+int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
		 __u16 searchHandle, struct cifs_search_info *psrch_inf)
 {
	TRANSACTION2_FNEXT_REQ *pSMB = NULL;
@@ -3860,7 +4134,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
	byte_count = params + 1 /* pad */ ;
	pSMB->TotalParameterCount = cpu_to_le16(params);
	pSMB->ParameterCount = pSMB->TotalParameterCount;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
	pSMB->ByteCount = cpu_to_le16(byte_count);

	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -3935,7 +4209,7 @@ FNext2_err_exit:
 }
 
 int
-CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
+CIFSFindClose(const int xid, struct cifs_tcon *tcon,
	      const __u16 searchHandle)
 {
	int rc = 0;
@@ -3967,7 +4241,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
 }
 
 int
-CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
+CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon,
		      const unsigned char *searchName,
		      __u64 *inode_number,
		      const struct nls_table *nls_codepage, int remap)
@@ -4022,7 +4296,7 @@ GetInodeNumberRetry:
	pSMB->ParameterCount = pSMB->TotalParameterCount;
	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_INTERNAL_INFO);
	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
	pSMB->ByteCount = cpu_to_le16(byte_count);

	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4032,8 +4306,8 @@ GetInodeNumberRetry:
	} else {
		/* decode response */
		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
-		if (rc || (pSMBr->ByteCount < 2))
		/* BB also check enough total bytes returned */
+		if (rc || get_bcc(&pSMBr->hdr) < 2)
			/* If rc should we check for EOPNOSUPP and
			   disable the srvino flag? or in caller? */
4039 rc = -EIO; /* bad smb */ 4313 rc = -EIO; /* bad smb */
@@ -4169,7 +4443,7 @@ parse_DFS_referrals_exit:
4169} 4443}
4170 4444
4171int 4445int
4172CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses, 4446CIFSGetDFSRefer(const int xid, struct cifs_ses *ses,
4173 const unsigned char *searchName, 4447 const unsigned char *searchName,
4174 struct dfs_info3_param **target_nodes, 4448 struct dfs_info3_param **target_nodes,
4175 unsigned int *num_of_nodes, 4449 unsigned int *num_of_nodes,
@@ -4218,7 +4492,7 @@ getDFSRetry:
4218 } 4492 }
4219 4493
4220 if (ses->server) { 4494 if (ses->server) {
4221 if (ses->server->secMode & 4495 if (ses->server->sec_mode &
4222 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 4496 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
4223 pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; 4497 pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
4224 } 4498 }
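
Note that secMode becoming sec_mode here is purely a rename, of a piece
with the cifsSesInfo -> cifs_ses and cifsTconInfo -> cifs_tcon renames
visible in every function signature above; the signing logic itself
(set SMBFLG2_SECURITY_SIGNATURE whenever the server enables or requires
signing) is unchanged.
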
@@ -4246,7 +4520,7 @@ getDFSRetry:
4246 pSMB->ParameterCount = cpu_to_le16(params); 4520 pSMB->ParameterCount = cpu_to_le16(params);
4247 pSMB->TotalParameterCount = pSMB->ParameterCount; 4521 pSMB->TotalParameterCount = pSMB->ParameterCount;
4248 pSMB->MaxReferralLevel = cpu_to_le16(3); 4522 pSMB->MaxReferralLevel = cpu_to_le16(3);
4249 pSMB->hdr.smb_buf_length += byte_count; 4523 inc_rfc1001_len(pSMB, byte_count);
4250 pSMB->ByteCount = cpu_to_le16(byte_count); 4524 pSMB->ByteCount = cpu_to_le16(byte_count);
4251 4525
4252 rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB, 4526 rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
@@ -4258,13 +4532,13 @@ getDFSRetry:
4258 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4532 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4259 4533
4260 /* BB Also check if enough total bytes returned? */ 4534 /* BB Also check if enough total bytes returned? */
4261 if (rc || (pSMBr->ByteCount < 17)) { 4535 if (rc || get_bcc(&pSMBr->hdr) < 17) {
4262 rc = -EIO; /* bad smb */ 4536 rc = -EIO; /* bad smb */
4263 goto GetDFSRefExit; 4537 goto GetDFSRefExit;
4264 } 4538 }
4265 4539
4266 cFYI(1, "Decoding GetDFSRefer response BCC: %d Offset %d", 4540 cFYI(1, "Decoding GetDFSRefer response BCC: %d Offset %d",
4267 pSMBr->ByteCount, 4541 get_bcc(&pSMBr->hdr),
4268 le16_to_cpu(pSMBr->t2.DataOffset)); 4542 le16_to_cpu(pSMBr->t2.DataOffset));
4269 4543
4270 /* parse returned result into more usable form */ 4544 /* parse returned result into more usable form */
@@ -4283,7 +4557,7 @@ GetDFSRefExit:
4283 4557
4284/* Query File System Info such as free space to old servers such as Win 9x */ 4558/* Query File System Info such as free space to old servers such as Win 9x */
4285int 4559int
4286SMBOldQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData) 4560SMBOldQFSInfo(const int xid, struct cifs_tcon *tcon, struct kstatfs *FSData)
4287{ 4561{
4288/* level 0x01 SMB_QUERY_FILE_SYSTEM_INFO */ 4562/* level 0x01 SMB_QUERY_FILE_SYSTEM_INFO */
4289 TRANSACTION2_QFSI_REQ *pSMB = NULL; 4563 TRANSACTION2_QFSI_REQ *pSMB = NULL;
@@ -4320,7 +4594,7 @@ oldQFSInfoRetry:
4320 pSMB->Reserved3 = 0; 4594 pSMB->Reserved3 = 0;
4321 pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); 4595 pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
4322 pSMB->InformationLevel = cpu_to_le16(SMB_INFO_ALLOCATION); 4596 pSMB->InformationLevel = cpu_to_le16(SMB_INFO_ALLOCATION);
4323 pSMB->hdr.smb_buf_length += byte_count; 4597 inc_rfc1001_len(pSMB, byte_count);
4324 pSMB->ByteCount = cpu_to_le16(byte_count); 4598 pSMB->ByteCount = cpu_to_le16(byte_count);
4325 4599
4326 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4600 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4330,12 +4604,12 @@ oldQFSInfoRetry:
4330 } else { /* decode response */ 4604 } else { /* decode response */
4331 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4605 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4332 4606
4333 if (rc || (pSMBr->ByteCount < 18)) 4607 if (rc || get_bcc(&pSMBr->hdr) < 18)
4334 rc = -EIO; /* bad smb */ 4608 rc = -EIO; /* bad smb */
4335 else { 4609 else {
4336 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 4610 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
4337 cFYI(1, "qfsinf resp BCC: %d Offset %d", 4611 cFYI(1, "qfsinf resp BCC: %d Offset %d",
4338 pSMBr->ByteCount, data_offset); 4612 get_bcc(&pSMBr->hdr), data_offset);
4339 4613
4340 response_data = (FILE_SYSTEM_ALLOC_INFO *) 4614 response_data = (FILE_SYSTEM_ALLOC_INFO *)
4341 (((char *) &pSMBr->hdr.Protocol) + data_offset); 4615 (((char *) &pSMBr->hdr.Protocol) + data_offset);
@@ -4362,7 +4636,7 @@ oldQFSInfoRetry:
4362} 4636}
4363 4637
4364int 4638int
4365CIFSSMBQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData) 4639CIFSSMBQFSInfo(const int xid, struct cifs_tcon *tcon, struct kstatfs *FSData)
4366{ 4640{
4367/* level 0x103 SMB_QUERY_FILE_SYSTEM_INFO */ 4641/* level 0x103 SMB_QUERY_FILE_SYSTEM_INFO */
4368 TRANSACTION2_QFSI_REQ *pSMB = NULL; 4642 TRANSACTION2_QFSI_REQ *pSMB = NULL;
@@ -4399,7 +4673,7 @@ QFSInfoRetry:
4399 pSMB->Reserved3 = 0; 4673 pSMB->Reserved3 = 0;
4400 pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); 4674 pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
4401 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_SIZE_INFO); 4675 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_SIZE_INFO);
4402 pSMB->hdr.smb_buf_length += byte_count; 4676 inc_rfc1001_len(pSMB, byte_count);
4403 pSMB->ByteCount = cpu_to_le16(byte_count); 4677 pSMB->ByteCount = cpu_to_le16(byte_count);
4404 4678
4405 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4679 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4409,7 +4683,7 @@ QFSInfoRetry:
4409 } else { /* decode response */ 4683 } else { /* decode response */
4410 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4684 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4411 4685
4412 if (rc || (pSMBr->ByteCount < 24)) 4686 if (rc || get_bcc(&pSMBr->hdr) < 24)
4413 rc = -EIO; /* bad smb */ 4687 rc = -EIO; /* bad smb */
4414 else { 4688 else {
4415 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 4689 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4441,7 +4715,7 @@ QFSInfoRetry:
4441} 4715}
4442 4716
4443int 4717int
4444CIFSSMBQFSAttributeInfo(const int xid, struct cifsTconInfo *tcon) 4718CIFSSMBQFSAttributeInfo(const int xid, struct cifs_tcon *tcon)
4445{ 4719{
4446/* level 0x105 SMB_QUERY_FILE_SYSTEM_INFO */ 4720/* level 0x105 SMB_QUERY_FILE_SYSTEM_INFO */
4447 TRANSACTION2_QFSI_REQ *pSMB = NULL; 4721 TRANSACTION2_QFSI_REQ *pSMB = NULL;
@@ -4479,7 +4753,7 @@ QFSAttributeRetry:
4479 pSMB->Reserved3 = 0; 4753 pSMB->Reserved3 = 0;
4480 pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); 4754 pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
4481 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_ATTRIBUTE_INFO); 4755 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_ATTRIBUTE_INFO);
4482 pSMB->hdr.smb_buf_length += byte_count; 4756 inc_rfc1001_len(pSMB, byte_count);
4483 pSMB->ByteCount = cpu_to_le16(byte_count); 4757 pSMB->ByteCount = cpu_to_le16(byte_count);
4484 4758
4485 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4759 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4489,7 +4763,7 @@ QFSAttributeRetry:
4489 } else { /* decode response */ 4763 } else { /* decode response */
4490 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4764 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4491 4765
4492 if (rc || (pSMBr->ByteCount < 13)) { 4766 if (rc || get_bcc(&pSMBr->hdr) < 13) {
4493 /* BB also check if enough bytes returned */ 4767 /* BB also check if enough bytes returned */
4494 rc = -EIO; /* bad smb */ 4768 rc = -EIO; /* bad smb */
4495 } else { 4769 } else {
@@ -4511,7 +4785,7 @@ QFSAttributeRetry:
4511} 4785}
4512 4786
4513int 4787int
4514CIFSSMBQFSDeviceInfo(const int xid, struct cifsTconInfo *tcon) 4788CIFSSMBQFSDeviceInfo(const int xid, struct cifs_tcon *tcon)
4515{ 4789{
4516/* level 0x104 SMB_QUERY_FILE_SYSTEM_INFO */ 4790/* level 0x104 SMB_QUERY_FILE_SYSTEM_INFO */
4517 TRANSACTION2_QFSI_REQ *pSMB = NULL; 4791 TRANSACTION2_QFSI_REQ *pSMB = NULL;
@@ -4550,7 +4824,7 @@ QFSDeviceRetry:
4550 pSMB->Reserved3 = 0; 4824 pSMB->Reserved3 = 0;
4551 pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); 4825 pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
4552 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_DEVICE_INFO); 4826 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_DEVICE_INFO);
4553 pSMB->hdr.smb_buf_length += byte_count; 4827 inc_rfc1001_len(pSMB, byte_count);
4554 pSMB->ByteCount = cpu_to_le16(byte_count); 4828 pSMB->ByteCount = cpu_to_le16(byte_count);
4555 4829
4556 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4830 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4560,7 +4834,8 @@ QFSDeviceRetry:
4560 } else { /* decode response */ 4834 } else { /* decode response */
4561 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4835 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4562 4836
4563 if (rc || (pSMBr->ByteCount < sizeof(FILE_SYSTEM_DEVICE_INFO))) 4837 if (rc || get_bcc(&pSMBr->hdr) <
4838 sizeof(FILE_SYSTEM_DEVICE_INFO))
4564 rc = -EIO; /* bad smb */ 4839 rc = -EIO; /* bad smb */
4565 else { 4840 else {
4566 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 4841 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4581,7 +4856,7 @@ QFSDeviceRetry:
4581} 4856}
4582 4857
4583int 4858int
4584CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon) 4859CIFSSMBQFSUnixInfo(const int xid, struct cifs_tcon *tcon)
4585{ 4860{
4586/* level 0x200 SMB_QUERY_CIFS_UNIX_INFO */ 4861/* level 0x200 SMB_QUERY_CIFS_UNIX_INFO */
4587 TRANSACTION2_QFSI_REQ *pSMB = NULL; 4862 TRANSACTION2_QFSI_REQ *pSMB = NULL;
@@ -4619,7 +4894,7 @@ QFSUnixRetry:
4619 pSMB->Reserved3 = 0; 4894 pSMB->Reserved3 = 0;
4620 pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); 4895 pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
4621 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_CIFS_UNIX_INFO); 4896 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_CIFS_UNIX_INFO);
4622 pSMB->hdr.smb_buf_length += byte_count; 4897 inc_rfc1001_len(pSMB, byte_count);
4623 pSMB->ByteCount = cpu_to_le16(byte_count); 4898 pSMB->ByteCount = cpu_to_le16(byte_count);
4624 4899
4625 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4900 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4629,7 +4904,7 @@ QFSUnixRetry:
4629 } else { /* decode response */ 4904 } else { /* decode response */
4630 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4905 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4631 4906
4632 if (rc || (pSMBr->ByteCount < 13)) { 4907 if (rc || get_bcc(&pSMBr->hdr) < 13) {
4633 rc = -EIO; /* bad smb */ 4908 rc = -EIO; /* bad smb */
4634 } else { 4909 } else {
4635 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 4910 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4651,7 +4926,7 @@ QFSUnixRetry:
4651} 4926}
4652 4927
4653int 4928int
4654CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap) 4929CIFSSMBSetFSUnixInfo(const int xid, struct cifs_tcon *tcon, __u64 cap)
4655{ 4930{
4656/* level 0x200 SMB_SET_CIFS_UNIX_INFO */ 4931/* level 0x200 SMB_SET_CIFS_UNIX_INFO */
4657 TRANSACTION2_SETFSI_REQ *pSMB = NULL; 4932 TRANSACTION2_SETFSI_REQ *pSMB = NULL;
@@ -4702,7 +4977,7 @@ SETFSUnixRetry:
4702 pSMB->ClientUnixMinor = cpu_to_le16(CIFS_UNIX_MINOR_VERSION); 4977 pSMB->ClientUnixMinor = cpu_to_le16(CIFS_UNIX_MINOR_VERSION);
4703 pSMB->ClientUnixCap = cpu_to_le64(cap); 4978 pSMB->ClientUnixCap = cpu_to_le64(cap);
4704 4979
4705 pSMB->hdr.smb_buf_length += byte_count; 4980 inc_rfc1001_len(pSMB, byte_count);
4706 pSMB->ByteCount = cpu_to_le16(byte_count); 4981 pSMB->ByteCount = cpu_to_le16(byte_count);
4707 4982
4708 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4983 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4725,7 +5000,7 @@ SETFSUnixRetry:
4725 5000
4726 5001
4727int 5002int
4728CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon, 5003CIFSSMBQFSPosixInfo(const int xid, struct cifs_tcon *tcon,
4729 struct kstatfs *FSData) 5004 struct kstatfs *FSData)
4730{ 5005{
4731/* level 0x201 SMB_QUERY_CIFS_POSIX_INFO */ 5006/* level 0x201 SMB_QUERY_CIFS_POSIX_INFO */
@@ -4764,7 +5039,7 @@ QFSPosixRetry:
4764 pSMB->Reserved3 = 0; 5039 pSMB->Reserved3 = 0;
4765 pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); 5040 pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
4766 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_FS_INFO); 5041 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_FS_INFO);
4767 pSMB->hdr.smb_buf_length += byte_count; 5042 inc_rfc1001_len(pSMB, byte_count);
4768 pSMB->ByteCount = cpu_to_le16(byte_count); 5043 pSMB->ByteCount = cpu_to_le16(byte_count);
4769 5044
4770 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5045 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4774,7 +5049,7 @@ QFSPosixRetry:
4774 } else { /* decode response */ 5049 } else { /* decode response */
4775 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 5050 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4776 5051
4777 if (rc || (pSMBr->ByteCount < 13)) { 5052 if (rc || get_bcc(&pSMBr->hdr) < 13) {
4778 rc = -EIO; /* bad smb */ 5053 rc = -EIO; /* bad smb */
4779 } else { 5054 } else {
4780 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 5055 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4818,7 +5093,7 @@ QFSPosixRetry:
4818 in Samba which this routine can run into */ 5093 in Samba which this routine can run into */
4819 5094
4820int 5095int
4821CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon, const char *fileName, 5096CIFSSMBSetEOF(const int xid, struct cifs_tcon *tcon, const char *fileName,
4822 __u64 size, bool SetAllocation, 5097 __u64 size, bool SetAllocation,
4823 const struct nls_table *nls_codepage, int remap) 5098 const struct nls_table *nls_codepage, int remap)
4824{ 5099{
@@ -4890,7 +5165,7 @@ SetEOFRetry:
4890 pSMB->ParameterCount = cpu_to_le16(params); 5165 pSMB->ParameterCount = cpu_to_le16(params);
4891 pSMB->TotalParameterCount = pSMB->ParameterCount; 5166 pSMB->TotalParameterCount = pSMB->ParameterCount;
4892 pSMB->Reserved4 = 0; 5167 pSMB->Reserved4 = 0;
4893 pSMB->hdr.smb_buf_length += byte_count; 5168 inc_rfc1001_len(pSMB, byte_count);
4894 parm_data->FileSize = cpu_to_le64(size); 5169 parm_data->FileSize = cpu_to_le64(size);
4895 pSMB->ByteCount = cpu_to_le16(byte_count); 5170 pSMB->ByteCount = cpu_to_le16(byte_count);
4896 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5171 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4907,7 +5182,7 @@ SetEOFRetry:
4907} 5182}
4908 5183
4909int 5184int
4910CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size, 5185CIFSSMBSetFileSize(const int xid, struct cifs_tcon *tcon, __u64 size,
4911 __u16 fid, __u32 pid_of_opener, bool SetAllocation) 5186 __u16 fid, __u32 pid_of_opener, bool SetAllocation)
4912{ 5187{
4913 struct smb_com_transaction2_sfi_req *pSMB = NULL; 5188 struct smb_com_transaction2_sfi_req *pSMB = NULL;
@@ -4969,7 +5244,7 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4969 cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO); 5244 cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO);
4970 } 5245 }
4971 pSMB->Reserved4 = 0; 5246 pSMB->Reserved4 = 0;
4972 pSMB->hdr.smb_buf_length += byte_count; 5247 inc_rfc1001_len(pSMB, byte_count);
4973 pSMB->ByteCount = cpu_to_le16(byte_count); 5248 pSMB->ByteCount = cpu_to_le16(byte_count);
4974 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 5249 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
4975 if (rc) { 5250 if (rc) {
@@ -4989,7 +5264,7 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4989 time and resort to the original setpathinfo level which takes the ancient 5264 time and resort to the original setpathinfo level which takes the ancient
4990 DOS time format with 2 second granularity */ 5265 DOS time format with 2 second granularity */
4991int 5266int
4992CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon, 5267CIFSSMBSetFileInfo(const int xid, struct cifs_tcon *tcon,
4993 const FILE_BASIC_INFO *data, __u16 fid, __u32 pid_of_opener) 5268 const FILE_BASIC_INFO *data, __u16 fid, __u32 pid_of_opener)
4994{ 5269{
4995 struct smb_com_transaction2_sfi_req *pSMB = NULL; 5270 struct smb_com_transaction2_sfi_req *pSMB = NULL;
@@ -5037,7 +5312,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
5037 else 5312 else
5038 pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO); 5313 pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO);
5039 pSMB->Reserved4 = 0; 5314 pSMB->Reserved4 = 0;
5040 pSMB->hdr.smb_buf_length += byte_count; 5315 inc_rfc1001_len(pSMB, byte_count);
5041 pSMB->ByteCount = cpu_to_le16(byte_count); 5316 pSMB->ByteCount = cpu_to_le16(byte_count);
5042 memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); 5317 memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
5043 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 5318 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
@@ -5051,7 +5326,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
5051} 5326}
5052 5327
5053int 5328int
5054CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon, 5329CIFSSMBSetFileDisposition(const int xid, struct cifs_tcon *tcon,
5055 bool delete_file, __u16 fid, __u32 pid_of_opener) 5330 bool delete_file, __u16 fid, __u32 pid_of_opener)
5056{ 5331{
5057 struct smb_com_transaction2_sfi_req *pSMB = NULL; 5332 struct smb_com_transaction2_sfi_req *pSMB = NULL;
@@ -5096,7 +5371,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
5096 pSMB->Fid = fid; 5371 pSMB->Fid = fid;
5097 pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO); 5372 pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO);
5098 pSMB->Reserved4 = 0; 5373 pSMB->Reserved4 = 0;
5099 pSMB->hdr.smb_buf_length += byte_count; 5374 inc_rfc1001_len(pSMB, byte_count);
5100 pSMB->ByteCount = cpu_to_le16(byte_count); 5375 pSMB->ByteCount = cpu_to_le16(byte_count);
5101 *data_offset = delete_file ? 1 : 0; 5376 *data_offset = delete_file ? 1 : 0;
5102 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 5377 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
@@ -5107,7 +5382,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
5107} 5382}
5108 5383
5109int 5384int
5110CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon, 5385CIFSSMBSetPathInfo(const int xid, struct cifs_tcon *tcon,
5111 const char *fileName, const FILE_BASIC_INFO *data, 5386 const char *fileName, const FILE_BASIC_INFO *data,
5112 const struct nls_table *nls_codepage, int remap) 5387 const struct nls_table *nls_codepage, int remap)
5113{ 5388{
@@ -5169,7 +5444,7 @@ SetTimesRetry:
5169 else 5444 else
5170 pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO); 5445 pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO);
5171 pSMB->Reserved4 = 0; 5446 pSMB->Reserved4 = 0;
5172 pSMB->hdr.smb_buf_length += byte_count; 5447 inc_rfc1001_len(pSMB, byte_count);
5173 memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); 5448 memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
5174 pSMB->ByteCount = cpu_to_le16(byte_count); 5449 pSMB->ByteCount = cpu_to_le16(byte_count);
5175 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5450 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -5191,7 +5466,7 @@ SetTimesRetry:
5191 handling it anyway and NT4 was what we thought it would be needed for 5466 handling it anyway and NT4 was what we thought it would be needed for
5192 Do not delete it until we prove whether needed for Win9x though */ 5467 Do not delete it until we prove whether needed for Win9x though */
5193int 5468int
5194CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, char *fileName, 5469CIFSSMBSetAttrLegacy(int xid, struct cifs_tcon *tcon, char *fileName,
5195 __u16 dos_attrs, const struct nls_table *nls_codepage) 5470 __u16 dos_attrs, const struct nls_table *nls_codepage)
5196{ 5471{
5197 SETATTR_REQ *pSMB = NULL; 5472 SETATTR_REQ *pSMB = NULL;
@@ -5221,7 +5496,7 @@ SetAttrLgcyRetry:
5221 } 5496 }
5222 pSMB->attr = cpu_to_le16(dos_attrs); 5497 pSMB->attr = cpu_to_le16(dos_attrs);
5223 pSMB->BufferFormat = 0x04; 5498 pSMB->BufferFormat = 0x04;
5224 pSMB->hdr.smb_buf_length += name_len + 1; 5499 inc_rfc1001_len(pSMB, name_len + 1);
5225 pSMB->ByteCount = cpu_to_le16(name_len + 1); 5500 pSMB->ByteCount = cpu_to_le16(name_len + 1);
5226 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5501 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5227 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5502 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -5279,7 +5554,7 @@ cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
5279} 5554}
5280 5555
5281int 5556int
5282CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon, 5557CIFSSMBUnixSetFileInfo(const int xid, struct cifs_tcon *tcon,
5283 const struct cifs_unix_set_info_args *args, 5558 const struct cifs_unix_set_info_args *args,
5284 u16 fid, u32 pid_of_opener) 5559 u16 fid, u32 pid_of_opener)
5285{ 5560{
@@ -5326,7 +5601,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
5326 pSMB->Fid = fid; 5601 pSMB->Fid = fid;
5327 pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC); 5602 pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC);
5328 pSMB->Reserved4 = 0; 5603 pSMB->Reserved4 = 0;
5329 pSMB->hdr.smb_buf_length += byte_count; 5604 inc_rfc1001_len(pSMB, byte_count);
5330 pSMB->ByteCount = cpu_to_le16(byte_count); 5605 pSMB->ByteCount = cpu_to_le16(byte_count);
5331 5606
5332 cifs_fill_unix_set_info(data_offset, args); 5607 cifs_fill_unix_set_info(data_offset, args);
@@ -5342,7 +5617,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
5342} 5617}
5343 5618
5344int 5619int
5345CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *tcon, char *fileName, 5620CIFSSMBUnixSetPathInfo(const int xid, struct cifs_tcon *tcon, char *fileName,
5346 const struct cifs_unix_set_info_args *args, 5621 const struct cifs_unix_set_info_args *args,
5347 const struct nls_table *nls_codepage, int remap) 5622 const struct nls_table *nls_codepage, int remap)
5348{ 5623{
@@ -5402,7 +5677,7 @@ setPermsRetry:
5402 pSMB->TotalDataCount = pSMB->DataCount; 5677 pSMB->TotalDataCount = pSMB->DataCount;
5403 pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC); 5678 pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC);
5404 pSMB->Reserved4 = 0; 5679 pSMB->Reserved4 = 0;
5405 pSMB->hdr.smb_buf_length += byte_count; 5680 inc_rfc1001_len(pSMB, byte_count);
5406 5681
5407 cifs_fill_unix_set_info(data_offset, args); 5682 cifs_fill_unix_set_info(data_offset, args);
5408 5683
@@ -5418,79 +5693,6 @@ setPermsRetry:
5418 return rc; 5693 return rc;
5419} 5694}
5420 5695
5421int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
5422 const int notify_subdirs, const __u16 netfid,
5423 __u32 filter, struct file *pfile, int multishot,
5424 const struct nls_table *nls_codepage)
5425{
5426 int rc = 0;
5427 struct smb_com_transaction_change_notify_req *pSMB = NULL;
5428 struct smb_com_ntransaction_change_notify_rsp *pSMBr = NULL;
5429 struct dir_notify_req *dnotify_req;
5430 int bytes_returned;
5431
5432 cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid);
5433 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
5434 (void **) &pSMBr);
5435 if (rc)
5436 return rc;
5437
5438 pSMB->TotalParameterCount = 0 ;
5439 pSMB->TotalDataCount = 0;
5440 pSMB->MaxParameterCount = cpu_to_le32(2);
5441 /* BB find exact data count max from sess structure BB */
5442 pSMB->MaxDataCount = 0; /* same in little endian or be */
5443/* BB VERIFY verify which is correct for above BB */
5444 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
5445 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
5446
5447 pSMB->MaxSetupCount = 4;
5448 pSMB->Reserved = 0;
5449 pSMB->ParameterOffset = 0;
5450 pSMB->DataCount = 0;
5451 pSMB->DataOffset = 0;
5452 pSMB->SetupCount = 4; /* single byte does not need le conversion */
5453 pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_NOTIFY_CHANGE);
5454 pSMB->ParameterCount = pSMB->TotalParameterCount;
5455 if (notify_subdirs)
5456 pSMB->WatchTree = 1; /* one byte - no le conversion needed */
5457 pSMB->Reserved2 = 0;
5458 pSMB->CompletionFilter = cpu_to_le32(filter);
5459 pSMB->Fid = netfid; /* file handle always le */
5460 pSMB->ByteCount = 0;
5461
5462 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5463 (struct smb_hdr *)pSMBr, &bytes_returned,
5464 CIFS_ASYNC_OP);
5465 if (rc) {
5466 cFYI(1, "Error in Notify = %d", rc);
5467 } else {
5468 /* Add file to outstanding requests */
5469 /* BB change to kmem cache alloc */
5470 dnotify_req = kmalloc(
5471 sizeof(struct dir_notify_req),
5472 GFP_KERNEL);
5473 if (dnotify_req) {
5474 dnotify_req->Pid = pSMB->hdr.Pid;
5475 dnotify_req->PidHigh = pSMB->hdr.PidHigh;
5476 dnotify_req->Mid = pSMB->hdr.Mid;
5477 dnotify_req->Tid = pSMB->hdr.Tid;
5478 dnotify_req->Uid = pSMB->hdr.Uid;
5479 dnotify_req->netfid = netfid;
5480 dnotify_req->pfile = pfile;
5481 dnotify_req->filter = filter;
5482 dnotify_req->multishot = multishot;
5483 spin_lock(&GlobalMid_Lock);
5484 list_add_tail(&dnotify_req->lhead,
5485 &GlobalDnotifyReqList);
5486 spin_unlock(&GlobalMid_Lock);
5487 } else
5488 rc = -ENOMEM;
5489 }
5490 cifs_buf_release(pSMB);
5491 return rc;
5492}
5493
5494#ifdef CONFIG_CIFS_XATTR 5696#ifdef CONFIG_CIFS_XATTR
5495/* 5697/*
5496 * Do a path-based QUERY_ALL_EAS call and parse the result. This is a common 5698 * Do a path-based QUERY_ALL_EAS call and parse the result. This is a common
@@ -5502,7 +5704,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
5502 * the data isn't copied to it, but the length is returned. 5704 * the data isn't copied to it, but the length is returned.
5503 */ 5705 */
5504ssize_t 5706ssize_t
5505CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon, 5707CIFSSMBQAllEAs(const int xid, struct cifs_tcon *tcon,
5506 const unsigned char *searchName, const unsigned char *ea_name, 5708 const unsigned char *searchName, const unsigned char *ea_name,
5507 char *EAData, size_t buf_size, 5709 char *EAData, size_t buf_size,
5508 const struct nls_table *nls_codepage, int remap) 5710 const struct nls_table *nls_codepage, int remap)
@@ -5560,7 +5762,7 @@ QAllEAsRetry:
5560 pSMB->ParameterCount = pSMB->TotalParameterCount; 5762 pSMB->ParameterCount = pSMB->TotalParameterCount;
5561 pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS); 5763 pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS);
5562 pSMB->Reserved4 = 0; 5764 pSMB->Reserved4 = 0;
5563 pSMB->hdr.smb_buf_length += byte_count; 5765 inc_rfc1001_len(pSMB, byte_count);
5564 pSMB->ByteCount = cpu_to_le16(byte_count); 5766 pSMB->ByteCount = cpu_to_le16(byte_count);
5565 5767
5566 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5768 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -5576,7 +5778,7 @@ QAllEAsRetry:
5576 of these trans2 responses */ 5778 of these trans2 responses */
5577 5779
5578 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 5780 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
5579 if (rc || (pSMBr->ByteCount < 4)) { 5781 if (rc || get_bcc(&pSMBr->hdr) < 4) {
5580 rc = -EIO; /* bad smb */ 5782 rc = -EIO; /* bad smb */
5581 goto QAllEAsOut; 5783 goto QAllEAsOut;
5582 } 5784 }
@@ -5683,7 +5885,7 @@ QAllEAsOut:
5683} 5885}
5684 5886
5685int 5887int
5686CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, const char *fileName, 5888CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon, const char *fileName,
5687 const char *ea_name, const void *ea_value, 5889 const char *ea_name, const void *ea_value,
5688 const __u16 ea_value_len, const struct nls_table *nls_codepage, 5890 const __u16 ea_value_len, const struct nls_table *nls_codepage,
5689 int remap) 5891 int remap)
@@ -5773,7 +5975,7 @@ SetEARetry:
5773 pSMB->ParameterCount = cpu_to_le16(params); 5975 pSMB->ParameterCount = cpu_to_le16(params);
5774 pSMB->TotalParameterCount = pSMB->ParameterCount; 5976 pSMB->TotalParameterCount = pSMB->ParameterCount;
5775 pSMB->Reserved4 = 0; 5977 pSMB->Reserved4 = 0;
5776 pSMB->hdr.smb_buf_length += byte_count; 5978 inc_rfc1001_len(pSMB, byte_count);
5777 pSMB->ByteCount = cpu_to_le16(byte_count); 5979 pSMB->ByteCount = cpu_to_le16(byte_count);
5778 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5980 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5779 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5981 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -5787,5 +5989,99 @@ SetEARetry:
5787 5989
5788 return rc; 5990 return rc;
5789} 5991}
5790
5791#endif 5992#endif
5993
5994#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* BB unused temporarily */
5995/*
5996 * Years ago the kernel added a "dnotify" function for Samba server,
5997 * to allow network clients (such as Windows) to display updated
5998 * lists of files in directory listings automatically when
5999 * files are added by one user when another user has the
6000 * same directory open on their desktop. The Linux cifs kernel
6001 * client hooked into the kernel side of this interface for
6002 * the same reason, but ironically when the VFS moved from
6003 * "dnotify" to "inotify" it became harder to plug in Linux
6004 * network file system clients (the most obvious use case
6005 * for notify interfaces is when multiple users can update
6006 * the contents of the same directory - exactly what network
6007 * file systems can do) although the server (Samba) could
6008 * still use it. For the short term we leave the worker
6009 * function ifdeffed out (below) until inotify is fixed
6010 * in the VFS to make it easier to plug in network file
6011 * system clients. If inotify turns out to be permanently
6012 * incompatible for network fs clients, we could instead simply
6013 * expose this config flag by adding a future cifs (and smb2) notify ioctl.
6014 */
6015int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon,
6016 const int notify_subdirs, const __u16 netfid,
6017 __u32 filter, struct file *pfile, int multishot,
6018 const struct nls_table *nls_codepage)
6019{
6020 int rc = 0;
6021 struct smb_com_transaction_change_notify_req *pSMB = NULL;
6022 struct smb_com_ntransaction_change_notify_rsp *pSMBr = NULL;
6023 struct dir_notify_req *dnotify_req;
6024 int bytes_returned;
6025
6026 cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid);
6027 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
6028 (void **) &pSMBr);
6029 if (rc)
6030 return rc;
6031
6032 pSMB->TotalParameterCount = 0 ;
6033 pSMB->TotalDataCount = 0;
6034 pSMB->MaxParameterCount = cpu_to_le32(2);
6035 /* BB find exact data count max from sess structure BB */
6036 pSMB->MaxDataCount = 0; /* same in little endian or be */
6037/* BB VERIFY verify which is correct for above BB */
6038 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
6039 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
6040
6041 pSMB->MaxSetupCount = 4;
6042 pSMB->Reserved = 0;
6043 pSMB->ParameterOffset = 0;
6044 pSMB->DataCount = 0;
6045 pSMB->DataOffset = 0;
6046 pSMB->SetupCount = 4; /* single byte does not need le conversion */
6047 pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_NOTIFY_CHANGE);
6048 pSMB->ParameterCount = pSMB->TotalParameterCount;
6049 if (notify_subdirs)
6050 pSMB->WatchTree = 1; /* one byte - no le conversion needed */
6051 pSMB->Reserved2 = 0;
6052 pSMB->CompletionFilter = cpu_to_le32(filter);
6053 pSMB->Fid = netfid; /* file handle always le */
6054 pSMB->ByteCount = 0;
6055
6056 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
6057 (struct smb_hdr *)pSMBr, &bytes_returned,
6058 CIFS_ASYNC_OP);
6059 if (rc) {
6060 cFYI(1, "Error in Notify = %d", rc);
6061 } else {
6062 /* Add file to outstanding requests */
6063 /* BB change to kmem cache alloc */
6064 dnotify_req = kmalloc(
6065 sizeof(struct dir_notify_req),
6066 GFP_KERNEL);
6067 if (dnotify_req) {
6068 dnotify_req->Pid = pSMB->hdr.Pid;
6069 dnotify_req->PidHigh = pSMB->hdr.PidHigh;
6070 dnotify_req->Mid = pSMB->hdr.Mid;
6071 dnotify_req->Tid = pSMB->hdr.Tid;
6072 dnotify_req->Uid = pSMB->hdr.Uid;
6073 dnotify_req->netfid = netfid;
6074 dnotify_req->pfile = pfile;
6075 dnotify_req->filter = filter;
6076 dnotify_req->multishot = multishot;
6077 spin_lock(&GlobalMid_Lock);
6078 list_add_tail(&dnotify_req->lhead,
6079 &GlobalDnotifyReqList);
6080 spin_unlock(&GlobalMid_Lock);
6081 } else
6082 rc = -ENOMEM;
6083 }
6084 cifs_buf_release(pSMB);
6085 return rc;
6086}
6087#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
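
CIFSSMBNotify is not deleted by this patch: the earlier hunk removing
it from the middle of cifssmb.c pairs with this one re-adding it at the
end of the file, now compiled only under CONFIG_CIFS_DNOTIFY_EXPERIMENTAL
and preceded by the comment explaining the dnotify/inotify history.
Apart from the cifsTconInfo -> cifs_tcon rename, the function body is
byte-for-byte the same.
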
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 277262a8e82f..6d88b82537c3 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -57,61 +57,6 @@
57 57
58extern mempool_t *cifs_req_poolp; 58extern mempool_t *cifs_req_poolp;
59 59
60struct smb_vol {
61 char *username;
62 char *password;
63 char *domainname;
64 char *UNC;
65 char *UNCip;
66 char *iocharset; /* local code page for mapping to and from Unicode */
67 char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
68 char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
69 uid_t cred_uid;
70 uid_t linux_uid;
71 gid_t linux_gid;
72 mode_t file_mode;
73 mode_t dir_mode;
74 unsigned secFlg;
75 bool retry:1;
76 bool intr:1;
77 bool setuids:1;
78 bool override_uid:1;
79 bool override_gid:1;
80 bool dynperm:1;
81 bool noperm:1;
82 bool no_psx_acl:1; /* set if posix acl support should be disabled */
83 bool cifs_acl:1;
84 bool no_xattr:1; /* set if xattr (EA) support should be disabled*/
85 bool server_ino:1; /* use inode numbers from server ie UniqueId */
86 bool direct_io:1;
87 bool strict_io:1; /* strict cache behavior */
88 bool remap:1; /* set to remap seven reserved chars in filenames */
89 bool posix_paths:1; /* unset to not ask for posix pathnames. */
90 bool no_linux_ext:1;
91 bool sfu_emul:1;
92 bool nullauth:1; /* attempt to authenticate with null user */
93 bool nocase:1; /* request case insensitive filenames */
94 bool nobrl:1; /* disable sending byte range locks to srv */
95 bool mand_lock:1; /* send mandatory not posix byte range lock reqs */
96 bool seal:1; /* request transport encryption on share */
97 bool nodfs:1; /* Do not request DFS, even if available */
98 bool local_lease:1; /* check leases only on local system, not remote */
99 bool noblocksnd:1;
100 bool noautotune:1;
101 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
102 bool fsc:1; /* enable fscache */
103 bool mfsymlinks:1; /* use Minshall+French Symlinks */
104 bool multiuser:1;
105 unsigned int rsize;
106 unsigned int wsize;
107 bool sockopt_tcp_nodelay:1;
108 unsigned short int port;
109 unsigned long actimeo; /* attribute cache timeout (jiffies) */
110 char *prepath;
111 struct sockaddr_storage srcaddr; /* allow binding to a local IP */
112 struct nls_table *local_nls;
113};
114
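
The smb_vol definition leaves connect.c entirely. Later hunks in this
file set fields that the removed copy does not have (rwpidforward,
below), so the structure has presumably been moved to a shared header,
most likely cifsglob.h, and extended there; that part of the series is
outside this diff.
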
115/* FIXME: should these be tunable? */ 60/* FIXME: should these be tunable? */
116#define TLINK_ERROR_EXPIRE (1 * HZ) 61#define TLINK_ERROR_EXPIRE (1 * HZ)
117#define TLINK_IDLE_EXPIRE (600 * HZ) 62#define TLINK_IDLE_EXPIRE (600 * HZ)
@@ -134,9 +79,10 @@ cifs_reconnect(struct TCP_Server_Info *server)
134{ 79{
135 int rc = 0; 80 int rc = 0;
136 struct list_head *tmp, *tmp2; 81 struct list_head *tmp, *tmp2;
137 struct cifsSesInfo *ses; 82 struct cifs_ses *ses;
138 struct cifsTconInfo *tcon; 83 struct cifs_tcon *tcon;
139 struct mid_q_entry *mid_entry; 84 struct mid_q_entry *mid_entry;
85 struct list_head retry_list;
140 86
141 spin_lock(&GlobalMid_Lock); 87 spin_lock(&GlobalMid_Lock);
142 if (server->tcpStatus == CifsExiting) { 88 if (server->tcpStatus == CifsExiting) {
@@ -156,11 +102,11 @@ cifs_reconnect(struct TCP_Server_Info *server)
156 cFYI(1, "%s: marking sessions and tcons for reconnect", __func__); 102 cFYI(1, "%s: marking sessions and tcons for reconnect", __func__);
157 spin_lock(&cifs_tcp_ses_lock); 103 spin_lock(&cifs_tcp_ses_lock);
158 list_for_each(tmp, &server->smb_ses_list) { 104 list_for_each(tmp, &server->smb_ses_list) {
159 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 105 ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
160 ses->need_reconnect = true; 106 ses->need_reconnect = true;
161 ses->ipc_tid = 0; 107 ses->ipc_tid = 0;
162 list_for_each(tmp2, &ses->tcon_list) { 108 list_for_each(tmp2, &ses->tcon_list) {
163 tcon = list_entry(tmp2, struct cifsTconInfo, tcon_list); 109 tcon = list_entry(tmp2, struct cifs_tcon, tcon_list);
164 tcon->need_reconnect = true; 110 tcon->need_reconnect = true;
165 } 111 }
166 } 112 }
@@ -188,16 +134,23 @@ cifs_reconnect(struct TCP_Server_Info *server)
188 mutex_unlock(&server->srv_mutex); 134 mutex_unlock(&server->srv_mutex);
189 135
190 /* mark submitted MIDs for retry and issue callback */ 136 /* mark submitted MIDs for retry and issue callback */
191 cFYI(1, "%s: issuing mid callbacks", __func__); 137 INIT_LIST_HEAD(&retry_list);
138 cFYI(1, "%s: moving mids to private list", __func__);
192 spin_lock(&GlobalMid_Lock); 139 spin_lock(&GlobalMid_Lock);
193 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { 140 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
194 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 141 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
195 if (mid_entry->midState == MID_REQUEST_SUBMITTED) 142 if (mid_entry->midState == MID_REQUEST_SUBMITTED)
196 mid_entry->midState = MID_RETRY_NEEDED; 143 mid_entry->midState = MID_RETRY_NEEDED;
144 list_move(&mid_entry->qhead, &retry_list);
145 }
146 spin_unlock(&GlobalMid_Lock);
147
148 cFYI(1, "%s: issuing mid callbacks", __func__);
149 list_for_each_safe(tmp, tmp2, &retry_list) {
150 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
197 list_del_init(&mid_entry->qhead); 151 list_del_init(&mid_entry->qhead);
198 mid_entry->callback(mid_entry); 152 mid_entry->callback(mid_entry);
199 } 153 }
200 spin_unlock(&GlobalMid_Lock);
201 154
202 while (server->tcpStatus == CifsNeedReconnect) { 155 while (server->tcpStatus == CifsNeedReconnect) {
203 try_to_freeze(); 156 try_to_freeze();
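
The cifs_reconnect() hunk above changes when mid callbacks run:
previously each callback was invoked while GlobalMid_Lock was still
held, but a callback may sleep or retake that lock. The fix detaches
the pending mids onto a function-local retry_list under the lock and
only then walks the private list issuing callbacks. The demultiplex
thread's exit path later in this file gets the same treatment with a
dispose_list. The generic shape of the pattern, with illustrative types
(the patch itself moves entries one at a time with list_move() because
it also updates midState under the lock):

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct work_item {
            struct list_head head;
            void (*callback)(struct work_item *);
    };

    static void drain_pending(struct list_head *pending, spinlock_t *lock)
    {
            struct work_item *item, *tmp;
            LIST_HEAD(private);

            spin_lock(lock);
            list_splice_init(pending, &private);    /* O(1) detach */
            spin_unlock(lock);

            list_for_each_entry_safe(item, tmp, &private, head) {
                    list_del_init(&item->head);
                    item->callback(item);           /* lock no longer held */
            }
    }
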
@@ -316,19 +269,19 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
316 put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount); 269 put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
317 270
318 /* fix up the BCC */ 271 /* fix up the BCC */
319 byte_count = get_bcc_le(pTargetSMB); 272 byte_count = get_bcc(pTargetSMB);
320 byte_count += total_in_buf2; 273 byte_count += total_in_buf2;
321 /* is the result too big for the field? */ 274 /* is the result too big for the field? */
322 if (byte_count > USHRT_MAX) 275 if (byte_count > USHRT_MAX)
323 return -EPROTO; 276 return -EPROTO;
324 put_bcc_le(byte_count, pTargetSMB); 277 put_bcc(byte_count, pTargetSMB);
325 278
326 byte_count = pTargetSMB->smb_buf_length; 279 byte_count = be32_to_cpu(pTargetSMB->smb_buf_length);
327 byte_count += total_in_buf2; 280 byte_count += total_in_buf2;
328 /* don't allow buffer to overflow */ 281 /* don't allow buffer to overflow */
329 if (byte_count > CIFSMaxBufSize) 282 if (byte_count > CIFSMaxBufSize)
330 return -ENOBUFS; 283 return -ENOBUFS;
331 pTargetSMB->smb_buf_length = byte_count; 284 pTargetSMB->smb_buf_length = cpu_to_be32(byte_count);
332 285
333 memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2); 286 memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
334 287
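
Here the meaning of smb_buf_length changes: the field now stays
big-endian (its on-wire RFC1001 form) at all times and is converted
with be32_to_cpu()/cpu_to_be32() only where a host-order value is
needed, which is exactly what makes the inc_rfc1001_len() helper in
cifssmb.c necessary. The BCC accessors likewise drop their _le variants
in favor of host-order ones; the setter is presumably the mirror image
of the get_bcc() sketch above:

    static inline void
    put_bcc(__u16 count, struct smb_hdr *hdr)
    {
            __le16 *bc_ptr = (__le16 *)((unsigned char *)hdr +
                                        sizeof(struct smb_hdr) +
                                        (2 * hdr->WordCount));

            *bc_ptr = cpu_to_le16(count);
    }

The following hunk drops the old in-place byte swap in the reader
thread for the same reason: with the field kept big-endian there is
nothing left to write back after be32_to_cpu().
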
@@ -495,8 +448,7 @@ incomplete_rcv:
495 /* Note that FC 1001 length is big endian on the wire, 448 /* Note that FC 1001 length is big endian on the wire,
496 but we convert it here so it is always manipulated 449 but we convert it here so it is always manipulated
497 as host byte order */ 450 as host byte order */
498 pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length); 451 pdu_length = be32_to_cpu(smb_buffer->smb_buf_length);
499 smb_buffer->smb_buf_length = pdu_length;
500 452
501 cFYI(1, "rfc1002 length 0x%x", pdu_length+4); 453 cFYI(1, "rfc1002 length 0x%x", pdu_length+4);
502 454
@@ -672,12 +624,12 @@ multi_t2_fnd:
672 mid_entry->when_received = jiffies; 624 mid_entry->when_received = jiffies;
673#endif 625#endif
674 list_del_init(&mid_entry->qhead); 626 list_del_init(&mid_entry->qhead);
675 mid_entry->callback(mid_entry);
676 break; 627 break;
677 } 628 }
678 spin_unlock(&GlobalMid_Lock); 629 spin_unlock(&GlobalMid_Lock);
679 630
680 if (mid_entry != NULL) { 631 if (mid_entry != NULL) {
632 mid_entry->callback(mid_entry);
681 /* Was previous buf put in mpx struct for multi-rsp? */ 633 /* Was previous buf put in mpx struct for multi-rsp? */
682 if (!isMultiRsp) { 634 if (!isMultiRsp) {
683 /* smb buffer will be freed by user thread */ 635 /* smb buffer will be freed by user thread */
@@ -735,21 +687,31 @@ multi_t2_fnd:
735 sock_release(csocket); 687 sock_release(csocket);
736 server->ssocket = NULL; 688 server->ssocket = NULL;
737 } 689 }
738 /* buffer usuallly freed in free_mid - need to free it here on exit */ 690 /* buffer usually freed in free_mid - need to free it here on exit */
739 cifs_buf_release(bigbuf); 691 cifs_buf_release(bigbuf);
740 if (smallbuf) /* no sense logging a debug message if NULL */ 692 if (smallbuf) /* no sense logging a debug message if NULL */
741 cifs_small_buf_release(smallbuf); 693 cifs_small_buf_release(smallbuf);
742 694
743 if (!list_empty(&server->pending_mid_q)) { 695 if (!list_empty(&server->pending_mid_q)) {
696 struct list_head dispose_list;
697
698 INIT_LIST_HEAD(&dispose_list);
744 spin_lock(&GlobalMid_Lock); 699 spin_lock(&GlobalMid_Lock);
745 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { 700 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
746 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 701 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
747 cFYI(1, "Clearing Mid 0x%x - issuing callback", 702 cFYI(1, "Clearing mid 0x%x", mid_entry->mid);
748 mid_entry->mid); 703 mid_entry->midState = MID_SHUTDOWN;
704 list_move(&mid_entry->qhead, &dispose_list);
705 }
706 spin_unlock(&GlobalMid_Lock);
707
708 /* now walk dispose list and issue callbacks */
709 list_for_each_safe(tmp, tmp2, &dispose_list) {
710 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
711 cFYI(1, "Callback mid 0x%x", mid_entry->mid);
749 list_del_init(&mid_entry->qhead); 712 list_del_init(&mid_entry->qhead);
750 mid_entry->callback(mid_entry); 713 mid_entry->callback(mid_entry);
751 } 714 }
752 spin_unlock(&GlobalMid_Lock);
753 /* 1/8th of sec is more than enough time for them to exit */ 715 /* 1/8th of sec is more than enough time for them to exit */
754 msleep(125); 716 msleep(125);
755 } 717 }
@@ -818,10 +780,11 @@ extract_hostname(const char *unc)
818} 780}
819 781
820static int 782static int
821cifs_parse_mount_options(char *options, const char *devname, 783cifs_parse_mount_options(const char *mountdata, const char *devname,
822 struct smb_vol *vol) 784 struct smb_vol *vol)
823{ 785{
824 char *value, *data, *end; 786 char *value, *data, *end;
787 char *mountdata_copy, *options;
825 unsigned int temp_len, i, j; 788 unsigned int temp_len, i, j;
826 char separator[2]; 789 char separator[2];
827 short int override_uid = -1; 790 short int override_uid = -1;
@@ -861,9 +824,14 @@ cifs_parse_mount_options(char *options, const char *devname,
861 824
862 vol->actimeo = CIFS_DEF_ACTIMEO; 825 vol->actimeo = CIFS_DEF_ACTIMEO;
863 826
864 if (!options) 827 if (!mountdata)
865 return 1; 828 goto cifs_parse_mount_err;
829
830 mountdata_copy = kstrndup(mountdata, PAGE_SIZE, GFP_KERNEL);
831 if (!mountdata_copy)
832 goto cifs_parse_mount_err;
866 833
834 options = mountdata_copy;
867 end = options + strlen(options); 835 end = options + strlen(options);
868 if (strncmp(options, "sep=", 4) == 0) { 836 if (strncmp(options, "sep=", 4) == 0) {
869 if (options[4] != 0) { 837 if (options[4] != 0) {
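
From this point on, every "return 1" in the option parser becomes
"goto cifs_parse_mount_err". The trigger is the new mountdata_copy:
the parser now tokenizes a kstrndup()'d private copy instead of
scribbling on the caller's mount data, so any value it keeps must be
duplicated (the kstrdup() calls below for username, UNCip, domainname
and iocharset) and every exit path must release whatever was allocated
so far. A minimal, hypothetical illustration of the idiom (names here
are made up, not the kernel's):

    #include <linux/mm.h>           /* PAGE_SIZE */
    #include <linux/slab.h>
    #include <linux/string.h>

    static int example_parse(const char *mountdata, char **name_out)
    {
            char *copy = NULL, *name = NULL;

            if (!mountdata)
                    goto parse_err;

            copy = kstrndup(mountdata, PAGE_SIZE, GFP_KERNEL);
            if (!copy)
                    goto parse_err;

            /* stand-in for extracting and keeping one option value */
            name = kstrdup(copy, GFP_KERNEL);
            if (!name)
                    goto parse_err;

            *name_out = name;
            kfree(copy);            /* safe: kept values were duplicated */
            return 0;

    parse_err:
            kfree(name);            /* kfree(NULL) is a no-op */
            kfree(copy);
            return 1;
    }
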
@@ -889,17 +857,22 @@ cifs_parse_mount_options(char *options, const char *devname,
889 if (!value) { 857 if (!value) {
890 printk(KERN_WARNING 858 printk(KERN_WARNING
891 "CIFS: invalid or missing username\n"); 859 "CIFS: invalid or missing username\n");
892 return 1; /* needs_arg; */ 860 goto cifs_parse_mount_err;
893 } else if (!*value) { 861 } else if (!*value) {
894 /* null user, ie anonymous, authentication */ 862 /* null user, ie anonymous, authentication */
895 vol->nullauth = 1; 863 vol->nullauth = 1;
896 } 864 }
897 if (strnlen(value, MAX_USERNAME_SIZE) < 865 if (strnlen(value, MAX_USERNAME_SIZE) <
898 MAX_USERNAME_SIZE) { 866 MAX_USERNAME_SIZE) {
899 vol->username = value; 867 vol->username = kstrdup(value, GFP_KERNEL);
868 if (!vol->username) {
869 printk(KERN_WARNING "CIFS: no memory "
870 "for username\n");
871 goto cifs_parse_mount_err;
872 }
900 } else { 873 } else {
901 printk(KERN_WARNING "CIFS: username too long\n"); 874 printk(KERN_WARNING "CIFS: username too long\n");
902 return 1; 875 goto cifs_parse_mount_err;
903 } 876 }
904 } else if (strnicmp(data, "pass", 4) == 0) { 877 } else if (strnicmp(data, "pass", 4) == 0) {
905 if (!value) { 878 if (!value) {
@@ -963,7 +936,7 @@ cifs_parse_mount_options(char *options, const char *devname,
963 if (vol->password == NULL) { 936 if (vol->password == NULL) {
964 printk(KERN_WARNING "CIFS: no memory " 937 printk(KERN_WARNING "CIFS: no memory "
965 "for password\n"); 938 "for password\n");
966 return 1; 939 goto cifs_parse_mount_err;
967 } 940 }
968 for (i = 0, j = 0; i < temp_len; i++, j++) { 941 for (i = 0, j = 0; i < temp_len; i++, j++) {
969 vol->password[j] = value[i]; 942 vol->password[j] = value[i];
@@ -979,7 +952,7 @@ cifs_parse_mount_options(char *options, const char *devname,
979 if (vol->password == NULL) { 952 if (vol->password == NULL) {
980 printk(KERN_WARNING "CIFS: no memory " 953 printk(KERN_WARNING "CIFS: no memory "
981 "for password\n"); 954 "for password\n");
982 return 1; 955 goto cifs_parse_mount_err;
983 } 956 }
984 strcpy(vol->password, value); 957 strcpy(vol->password, value);
985 } 958 }
@@ -989,11 +962,16 @@ cifs_parse_mount_options(char *options, const char *devname,
989 vol->UNCip = NULL; 962 vol->UNCip = NULL;
990 } else if (strnlen(value, INET6_ADDRSTRLEN) < 963 } else if (strnlen(value, INET6_ADDRSTRLEN) <
991 INET6_ADDRSTRLEN) { 964 INET6_ADDRSTRLEN) {
992 vol->UNCip = value; 965 vol->UNCip = kstrdup(value, GFP_KERNEL);
966 if (!vol->UNCip) {
967 printk(KERN_WARNING "CIFS: no memory "
968 "for UNC IP\n");
969 goto cifs_parse_mount_err;
970 }
993 } else { 971 } else {
994 printk(KERN_WARNING "CIFS: ip address " 972 printk(KERN_WARNING "CIFS: ip address "
995 "too long\n"); 973 "too long\n");
996 return 1; 974 goto cifs_parse_mount_err;
997 } 975 }
998 } else if (strnicmp(data, "sec", 3) == 0) { 976 } else if (strnicmp(data, "sec", 3) == 0) {
999 if (!value || !*value) { 977 if (!value || !*value) {
@@ -1006,7 +984,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1006 /* vol->secFlg |= CIFSSEC_MUST_SEAL | 984 /* vol->secFlg |= CIFSSEC_MUST_SEAL |
1007 CIFSSEC_MAY_KRB5; */ 985 CIFSSEC_MAY_KRB5; */
1008 cERROR(1, "Krb5 cifs privacy not supported"); 986 cERROR(1, "Krb5 cifs privacy not supported");
1009 return 1; 987 goto cifs_parse_mount_err;
1010 } else if (strnicmp(value, "krb5", 4) == 0) { 988 } else if (strnicmp(value, "krb5", 4) == 0) {
1011 vol->secFlg |= CIFSSEC_MAY_KRB5; 989 vol->secFlg |= CIFSSEC_MAY_KRB5;
1012 } else if (strnicmp(value, "ntlmsspi", 8) == 0) { 990 } else if (strnicmp(value, "ntlmsspi", 8) == 0) {
@@ -1036,7 +1014,16 @@ cifs_parse_mount_options(char *options, const char *devname,
1036 vol->nullauth = 1; 1014 vol->nullauth = 1;
1037 } else { 1015 } else {
1038 cERROR(1, "bad security option: %s", value); 1016 cERROR(1, "bad security option: %s", value);
1039 return 1; 1017 goto cifs_parse_mount_err;
1018 }
1019 } else if (strnicmp(data, "vers", 3) == 0) {
1020 if (!value || !*value) {
1021 cERROR(1, "no protocol version specified"
1022 " after vers= mount option");
1023 } else if ((strnicmp(value, "cifs", 4) == 0) ||
1024 (strnicmp(value, "1", 1) == 0)) {
1025 /* this is the default */
1026 continue;
1040 } 1027 }
1041 } else if ((strnicmp(data, "unc", 3) == 0) 1028 } else if ((strnicmp(data, "unc", 3) == 0)
1042 || (strnicmp(data, "target", 6) == 0) 1029 || (strnicmp(data, "target", 6) == 0)
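
The vers= option introduced here accepts only the current protocol:
"vers=cifs" and "vers=1" fall through as the default, and in the lines
shown no other value draws even a warning, so this looks like
groundwork for a future SMB2 dialect selector rather than a functional
switch. As of this patch a mount such as

    mount -t cifs //server/share /mnt -o vers=1,username=guest

should behave identically to the same mount without vers= (the share
and options above are illustrative, not from the patch).
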
@@ -1044,12 +1031,12 @@ cifs_parse_mount_options(char *options, const char *devname,
1044 if (!value || !*value) { 1031 if (!value || !*value) {
1045 printk(KERN_WARNING "CIFS: invalid path to " 1032 printk(KERN_WARNING "CIFS: invalid path to "
1046 "network resource\n"); 1033 "network resource\n");
1047 return 1; /* needs_arg; */ 1034 goto cifs_parse_mount_err;
1048 } 1035 }
1049 if ((temp_len = strnlen(value, 300)) < 300) { 1036 if ((temp_len = strnlen(value, 300)) < 300) {
1050 vol->UNC = kmalloc(temp_len+1, GFP_KERNEL); 1037 vol->UNC = kmalloc(temp_len+1, GFP_KERNEL);
1051 if (vol->UNC == NULL) 1038 if (vol->UNC == NULL)
1052 return 1; 1039 goto cifs_parse_mount_err;
1053 strcpy(vol->UNC, value); 1040 strcpy(vol->UNC, value);
1054 if (strncmp(vol->UNC, "//", 2) == 0) { 1041 if (strncmp(vol->UNC, "//", 2) == 0) {
1055 vol->UNC[0] = '\\'; 1042 vol->UNC[0] = '\\';
@@ -1058,27 +1045,32 @@ cifs_parse_mount_options(char *options, const char *devname,
1058 printk(KERN_WARNING 1045 printk(KERN_WARNING
1059 "CIFS: UNC Path does not begin " 1046 "CIFS: UNC Path does not begin "
1060 "with // or \\\\ \n"); 1047 "with // or \\\\ \n");
1061 return 1; 1048 goto cifs_parse_mount_err;
1062 } 1049 }
1063 } else { 1050 } else {
1064 printk(KERN_WARNING "CIFS: UNC name too long\n"); 1051 printk(KERN_WARNING "CIFS: UNC name too long\n");
1065 return 1; 1052 goto cifs_parse_mount_err;
1066 } 1053 }
1067 } else if ((strnicmp(data, "domain", 3) == 0) 1054 } else if ((strnicmp(data, "domain", 3) == 0)
1068 || (strnicmp(data, "workgroup", 5) == 0)) { 1055 || (strnicmp(data, "workgroup", 5) == 0)) {
1069 if (!value || !*value) { 1056 if (!value || !*value) {
1070 printk(KERN_WARNING "CIFS: invalid domain name\n"); 1057 printk(KERN_WARNING "CIFS: invalid domain name\n");
1071 return 1; /* needs_arg; */ 1058 goto cifs_parse_mount_err;
1072 } 1059 }
1073 /* BB are there cases in which a comma can be valid in 1060 /* BB are there cases in which a comma can be valid in
1074 a domain name and need special handling? */ 1061 a domain name and need special handling? */
1075 if (strnlen(value, 256) < 256) { 1062 if (strnlen(value, 256) < 256) {
1076 vol->domainname = value; 1063 vol->domainname = kstrdup(value, GFP_KERNEL);
1064 if (!vol->domainname) {
1065 printk(KERN_WARNING "CIFS: no memory "
1066 "for domainname\n");
1067 goto cifs_parse_mount_err;
1068 }
1077 cFYI(1, "Domain name set"); 1069 cFYI(1, "Domain name set");
1078 } else { 1070 } else {
1079 printk(KERN_WARNING "CIFS: domain name too " 1071 printk(KERN_WARNING "CIFS: domain name too "
1080 "long\n"); 1072 "long\n");
1081 return 1; 1073 goto cifs_parse_mount_err;
1082 } 1074 }
1083 } else if (strnicmp(data, "srcaddr", 7) == 0) { 1075 } else if (strnicmp(data, "srcaddr", 7) == 0) {
1084 vol->srcaddr.ss_family = AF_UNSPEC; 1076 vol->srcaddr.ss_family = AF_UNSPEC;
@@ -1086,7 +1078,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1086 if (!value || !*value) { 1078 if (!value || !*value) {
1087 printk(KERN_WARNING "CIFS: srcaddr value" 1079 printk(KERN_WARNING "CIFS: srcaddr value"
1088 " not specified.\n"); 1080 " not specified.\n");
1089 return 1; /* needs_arg; */ 1081 goto cifs_parse_mount_err;
1090 } 1082 }
1091 i = cifs_convert_address((struct sockaddr *)&vol->srcaddr, 1083 i = cifs_convert_address((struct sockaddr *)&vol->srcaddr,
1092 value, strlen(value)); 1084 value, strlen(value));
@@ -1094,20 +1086,20 @@ cifs_parse_mount_options(char *options, const char *devname,
1094 printk(KERN_WARNING "CIFS: Could not parse" 1086 printk(KERN_WARNING "CIFS: Could not parse"
1095 " srcaddr: %s\n", 1087 " srcaddr: %s\n",
1096 value); 1088 value);
1097 return 1; 1089 goto cifs_parse_mount_err;
1098 } 1090 }
1099 } else if (strnicmp(data, "prefixpath", 10) == 0) { 1091 } else if (strnicmp(data, "prefixpath", 10) == 0) {
1100 if (!value || !*value) { 1092 if (!value || !*value) {
1101 printk(KERN_WARNING 1093 printk(KERN_WARNING
1102 "CIFS: invalid path prefix\n"); 1094 "CIFS: invalid path prefix\n");
1103 return 1; /* needs_argument */ 1095 goto cifs_parse_mount_err;
1104 } 1096 }
1105 if ((temp_len = strnlen(value, 1024)) < 1024) { 1097 if ((temp_len = strnlen(value, 1024)) < 1024) {
1106 if (value[0] != '/') 1098 if (value[0] != '/')
1107 temp_len++; /* missing leading slash */ 1099 temp_len++; /* missing leading slash */
1108 vol->prepath = kmalloc(temp_len+1, GFP_KERNEL); 1100 vol->prepath = kmalloc(temp_len+1, GFP_KERNEL);
1109 if (vol->prepath == NULL) 1101 if (vol->prepath == NULL)
1110 return 1; 1102 goto cifs_parse_mount_err;
1111 if (value[0] != '/') { 1103 if (value[0] != '/') {
1112 vol->prepath[0] = '/'; 1104 vol->prepath[0] = '/';
1113 strcpy(vol->prepath+1, value); 1105 strcpy(vol->prepath+1, value);
@@ -1116,24 +1108,33 @@ cifs_parse_mount_options(char *options, const char *devname,
1116 cFYI(1, "prefix path %s", vol->prepath); 1108 cFYI(1, "prefix path %s", vol->prepath);
1117 } else { 1109 } else {
1118 printk(KERN_WARNING "CIFS: prefix too long\n"); 1110 printk(KERN_WARNING "CIFS: prefix too long\n");
1119 return 1; 1111 goto cifs_parse_mount_err;
1120 } 1112 }
1121 } else if (strnicmp(data, "iocharset", 9) == 0) { 1113 } else if (strnicmp(data, "iocharset", 9) == 0) {
1122 if (!value || !*value) { 1114 if (!value || !*value) {
1123 printk(KERN_WARNING "CIFS: invalid iocharset " 1115 printk(KERN_WARNING "CIFS: invalid iocharset "
1124 "specified\n"); 1116 "specified\n");
1125 return 1; /* needs_arg; */ 1117 goto cifs_parse_mount_err;
1126 } 1118 }
1127 if (strnlen(value, 65) < 65) { 1119 if (strnlen(value, 65) < 65) {
1128 if (strnicmp(value, "default", 7)) 1120 if (strnicmp(value, "default", 7)) {
1129 vol->iocharset = value; 1121 vol->iocharset = kstrdup(value,
1122 GFP_KERNEL);
1123
1124 if (!vol->iocharset) {
1125 printk(KERN_WARNING "CIFS: no "
1126 "memory for"
1127 "charset\n");
1128 goto cifs_parse_mount_err;
1129 }
1130 }
1130 /* if iocharset not set then load_nls_default 1131 /* if iocharset not set then load_nls_default
1131 is used by caller */ 1132 is used by caller */
1132 cFYI(1, "iocharset set to %s", value); 1133 cFYI(1, "iocharset set to %s", value);
1133 } else { 1134 } else {
1134 printk(KERN_WARNING "CIFS: iocharset name " 1135 printk(KERN_WARNING "CIFS: iocharset name "
1135 "too long.\n"); 1136 "too long.\n");
1136 return 1; 1137 goto cifs_parse_mount_err;
1137 } 1138 }
1138 } else if (!strnicmp(data, "uid", 3) && value && *value) { 1139 } else if (!strnicmp(data, "uid", 3) && value && *value) {
1139 vol->linux_uid = simple_strtoul(value, &value, 0); 1140 vol->linux_uid = simple_strtoul(value, &value, 0);
@@ -1246,7 +1247,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1246 if (vol->actimeo > CIFS_MAX_ACTIMEO) { 1247 if (vol->actimeo > CIFS_MAX_ACTIMEO) {
 1247 cERROR(1, "CIFS: attribute cache " 1248 cERROR(1, "CIFS: attribute cache "
1248 "timeout too large"); 1249 "timeout too large");
1249 return 1; 1250 goto cifs_parse_mount_err;
1250 } 1251 }
1251 } 1252 }
1252 } else if (strnicmp(data, "credentials", 4) == 0) { 1253 } else if (strnicmp(data, "credentials", 4) == 0) {
@@ -1358,6 +1359,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1358 vol->server_ino = 1; 1359 vol->server_ino = 1;
1359 } else if (strnicmp(data, "noserverino", 9) == 0) { 1360 } else if (strnicmp(data, "noserverino", 9) == 0) {
1360 vol->server_ino = 0; 1361 vol->server_ino = 0;
1362 } else if (strnicmp(data, "rwpidforward", 4) == 0) {
1363 vol->rwpidforward = 1;
1361 } else if (strnicmp(data, "cifsacl", 7) == 0) { 1364 } else if (strnicmp(data, "cifsacl", 7) == 0) {
1362 vol->cifs_acl = 1; 1365 vol->cifs_acl = 1;
1363 } else if (strnicmp(data, "nocifsacl", 9) == 0) { 1366 } else if (strnicmp(data, "nocifsacl", 9) == 0) {
@@ -1390,7 +1393,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1390#ifndef CONFIG_CIFS_FSCACHE 1393#ifndef CONFIG_CIFS_FSCACHE
1391 cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE" 1394 cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE"
1392 "kernel config option set"); 1395 "kernel config option set");
1393 return 1; 1396 goto cifs_parse_mount_err;
1394#endif 1397#endif
1395 vol->fsc = true; 1398 vol->fsc = true;
1396 } else if (strnicmp(data, "mfsymlinks", 10) == 0) { 1399 } else if (strnicmp(data, "mfsymlinks", 10) == 0) {
@@ -1405,12 +1408,12 @@ cifs_parse_mount_options(char *options, const char *devname,
1405 if (devname == NULL) { 1408 if (devname == NULL) {
1406 printk(KERN_WARNING "CIFS: Missing UNC name for mount " 1409 printk(KERN_WARNING "CIFS: Missing UNC name for mount "
1407 "target\n"); 1410 "target\n");
1408 return 1; 1411 goto cifs_parse_mount_err;
1409 } 1412 }
1410 if ((temp_len = strnlen(devname, 300)) < 300) { 1413 if ((temp_len = strnlen(devname, 300)) < 300) {
1411 vol->UNC = kmalloc(temp_len+1, GFP_KERNEL); 1414 vol->UNC = kmalloc(temp_len+1, GFP_KERNEL);
1412 if (vol->UNC == NULL) 1415 if (vol->UNC == NULL)
1413 return 1; 1416 goto cifs_parse_mount_err;
1414 strcpy(vol->UNC, devname); 1417 strcpy(vol->UNC, devname);
1415 if (strncmp(vol->UNC, "//", 2) == 0) { 1418 if (strncmp(vol->UNC, "//", 2) == 0) {
1416 vol->UNC[0] = '\\'; 1419 vol->UNC[0] = '\\';
@@ -1418,21 +1421,21 @@ cifs_parse_mount_options(char *options, const char *devname,
1418 } else if (strncmp(vol->UNC, "\\\\", 2) != 0) { 1421 } else if (strncmp(vol->UNC, "\\\\", 2) != 0) {
1419 printk(KERN_WARNING "CIFS: UNC Path does not " 1422 printk(KERN_WARNING "CIFS: UNC Path does not "
1420 "begin with // or \\\\ \n"); 1423 "begin with // or \\\\ \n");
1421 return 1; 1424 goto cifs_parse_mount_err;
1422 } 1425 }
1423 value = strpbrk(vol->UNC+2, "/\\"); 1426 value = strpbrk(vol->UNC+2, "/\\");
1424 if (value) 1427 if (value)
1425 *value = '\\'; 1428 *value = '\\';
1426 } else { 1429 } else {
1427 printk(KERN_WARNING "CIFS: UNC name too long\n"); 1430 printk(KERN_WARNING "CIFS: UNC name too long\n");
1428 return 1; 1431 goto cifs_parse_mount_err;
1429 } 1432 }
1430 } 1433 }
1431 1434
1432 if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) { 1435 if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
1433 cERROR(1, "Multiuser mounts currently require krb5 " 1436 cERROR(1, "Multiuser mounts currently require krb5 "
1434 "authentication!"); 1437 "authentication!");
1435 return 1; 1438 goto cifs_parse_mount_err;
1436 } 1439 }
1437 1440
1438 if (vol->UNCip == NULL) 1441 if (vol->UNCip == NULL)
@@ -1450,7 +1453,12 @@ cifs_parse_mount_options(char *options, const char *devname,
1450 printk(KERN_NOTICE "CIFS: ignoring forcegid mount option " 1453 printk(KERN_NOTICE "CIFS: ignoring forcegid mount option "
1451 "specified with no gid= option.\n"); 1454 "specified with no gid= option.\n");
1452 1455
1456 kfree(mountdata_copy);
1453 return 0; 1457 return 0;
1458
1459cifs_parse_mount_err:
1460 kfree(mountdata_copy);
1461 return 1;
1454} 1462}
1455 1463
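
Taken together, the hunks above convert every early "return 1" in cifs_parse_mount_options() into a jump to the single cifs_parse_mount_err label, so the mountdata_copy buffer is freed on every exit path. A minimal sketch of the same single-exit cleanup idiom, with hypothetical names and a hypothetical validation step:

    #include <stdlib.h>
    #include <string.h>

    /* sketch only: every failure path funnels through one label
     * that releases the working copy */
    int parse_options(const char *options)
    {
        char *copy = strdup(options);

        if (copy == NULL)
            return 1;

        if (strnlen(copy, 4096) == 4096)   /* hypothetical length check */
            goto parse_err;

        /* ... further option checks, each jumping to parse_err ... */

        free(copy);
        return 0;

    parse_err:
        free(copy);
        return 1;
    }
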
1456/** Returns true if srcaddr isn't specified and rhs isn't 1464/** Returns true if srcaddr isn't specified and rhs isn't
@@ -1589,16 +1597,35 @@ match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
1589 1597
1590 /* now check if signing mode is acceptable */ 1598 /* now check if signing mode is acceptable */
1591 if ((secFlags & CIFSSEC_MAY_SIGN) == 0 && 1599 if ((secFlags & CIFSSEC_MAY_SIGN) == 0 &&
1592 (server->secMode & SECMODE_SIGN_REQUIRED)) 1600 (server->sec_mode & SECMODE_SIGN_REQUIRED))
1593 return false; 1601 return false;
1594 else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) && 1602 else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) &&
1595 (server->secMode & 1603 (server->sec_mode &
1596 (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0) 1604 (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0)
1597 return false; 1605 return false;
1598 1606
1599 return true; 1607 return true;
1600} 1608}
1601 1609
1610static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr,
1611 struct smb_vol *vol)
1612{
1613 if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
1614 return 0;
1615
1616 if (!match_address(server, addr,
1617 (struct sockaddr *)&vol->srcaddr))
1618 return 0;
1619
1620 if (!match_port(server, addr))
1621 return 0;
1622
1623 if (!match_security(server, vol))
1624 return 0;
1625
1626 return 1;
1627}
1628
1602static struct TCP_Server_Info * 1629static struct TCP_Server_Info *
1603cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol) 1630cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
1604{ 1631{
@@ -1606,17 +1633,7 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
1606 1633
1607 spin_lock(&cifs_tcp_ses_lock); 1634 spin_lock(&cifs_tcp_ses_lock);
1608 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { 1635 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
1609 if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns)) 1636 if (!match_server(server, addr, vol))
1610 continue;
1611
1612 if (!match_address(server, addr,
1613 (struct sockaddr *)&vol->srcaddr))
1614 continue;
1615
1616 if (!match_port(server, addr))
1617 continue;
1618
1619 if (!match_security(server, vol))
1620 continue; 1637 continue;
1621 1638
1622 ++server->srv_count; 1639 ++server->srv_count;
@@ -1810,32 +1827,39 @@ out_err:
1810 return ERR_PTR(rc); 1827 return ERR_PTR(rc);
1811} 1828}
1812 1829
1813static struct cifsSesInfo * 1830static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
1831{
1832 switch (ses->server->secType) {
1833 case Kerberos:
1834 if (vol->cred_uid != ses->cred_uid)
1835 return 0;
1836 break;
1837 default:
1838 /* anything else takes username/password */
1839 if (ses->user_name == NULL)
1840 return 0;
1841 if (strncmp(ses->user_name, vol->username,
1842 MAX_USERNAME_SIZE))
1843 return 0;
1844 if (strlen(vol->username) != 0 &&
1845 ses->password != NULL &&
1846 strncmp(ses->password,
1847 vol->password ? vol->password : "",
1848 MAX_PASSWORD_SIZE))
1849 return 0;
1850 }
1851 return 1;
1852}
1853
1854static struct cifs_ses *
1814cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol) 1855cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
1815{ 1856{
1816 struct cifsSesInfo *ses; 1857 struct cifs_ses *ses;
1817 1858
1818 spin_lock(&cifs_tcp_ses_lock); 1859 spin_lock(&cifs_tcp_ses_lock);
1819 list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { 1860 list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
1820 switch (server->secType) { 1861 if (!match_session(ses, vol))
1821 case Kerberos: 1862 continue;
1822 if (vol->cred_uid != ses->cred_uid)
1823 continue;
1824 break;
1825 default:
1826 /* anything else takes username/password */
1827 if (ses->user_name == NULL)
1828 continue;
1829 if (strncmp(ses->user_name, vol->username,
1830 MAX_USERNAME_SIZE))
1831 continue;
1832 if (strlen(vol->username) != 0 &&
1833 ses->password != NULL &&
1834 strncmp(ses->password,
1835 vol->password ? vol->password : "",
1836 MAX_PASSWORD_SIZE))
1837 continue;
1838 }
1839 ++ses->ses_count; 1863 ++ses->ses_count;
1840 spin_unlock(&cifs_tcp_ses_lock); 1864 spin_unlock(&cifs_tcp_ses_lock);
1841 return ses; 1865 return ses;
@@ -1845,7 +1869,7 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
1845} 1869}
1846 1870
1847static void 1871static void
1848cifs_put_smb_ses(struct cifsSesInfo *ses) 1872cifs_put_smb_ses(struct cifs_ses *ses)
1849{ 1873{
1850 int xid; 1874 int xid;
1851 struct TCP_Server_Info *server = ses->server; 1875 struct TCP_Server_Info *server = ses->server;
@@ -1871,11 +1895,11 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
1871 1895
1872static bool warned_on_ntlm; /* globals init to false automatically */ 1896static bool warned_on_ntlm; /* globals init to false automatically */
1873 1897
1874static struct cifsSesInfo * 1898static struct cifs_ses *
1875cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) 1899cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1876{ 1900{
1877 int rc = -ENOMEM, xid; 1901 int rc = -ENOMEM, xid;
1878 struct cifsSesInfo *ses; 1902 struct cifs_ses *ses;
1879 struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; 1903 struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
1880 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; 1904 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
1881 1905
@@ -1978,20 +2002,26 @@ get_ses_fail:
1978 return ERR_PTR(rc); 2002 return ERR_PTR(rc);
1979} 2003}
1980 2004
1981static struct cifsTconInfo * 2005static int match_tcon(struct cifs_tcon *tcon, const char *unc)
1982cifs_find_tcon(struct cifsSesInfo *ses, const char *unc) 2006{
2007 if (tcon->tidStatus == CifsExiting)
2008 return 0;
2009 if (strncmp(tcon->treeName, unc, MAX_TREE_SIZE))
2010 return 0;
2011 return 1;
2012}
2013
2014static struct cifs_tcon *
2015cifs_find_tcon(struct cifs_ses *ses, const char *unc)
1983{ 2016{
1984 struct list_head *tmp; 2017 struct list_head *tmp;
1985 struct cifsTconInfo *tcon; 2018 struct cifs_tcon *tcon;
1986 2019
1987 spin_lock(&cifs_tcp_ses_lock); 2020 spin_lock(&cifs_tcp_ses_lock);
1988 list_for_each(tmp, &ses->tcon_list) { 2021 list_for_each(tmp, &ses->tcon_list) {
1989 tcon = list_entry(tmp, struct cifsTconInfo, tcon_list); 2022 tcon = list_entry(tmp, struct cifs_tcon, tcon_list);
1990 if (tcon->tidStatus == CifsExiting) 2023 if (!match_tcon(tcon, unc))
1991 continue;
1992 if (strncmp(tcon->treeName, unc, MAX_TREE_SIZE))
1993 continue; 2024 continue;
1994
1995 ++tcon->tc_count; 2025 ++tcon->tc_count;
1996 spin_unlock(&cifs_tcp_ses_lock); 2026 spin_unlock(&cifs_tcp_ses_lock);
1997 return tcon; 2027 return tcon;
@@ -2001,10 +2031,10 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
2001} 2031}
2002 2032
2003static void 2033static void
2004cifs_put_tcon(struct cifsTconInfo *tcon) 2034cifs_put_tcon(struct cifs_tcon *tcon)
2005{ 2035{
2006 int xid; 2036 int xid;
2007 struct cifsSesInfo *ses = tcon->ses; 2037 struct cifs_ses *ses = tcon->ses;
2008 2038
2009 cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count); 2039 cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
2010 spin_lock(&cifs_tcp_ses_lock); 2040 spin_lock(&cifs_tcp_ses_lock);
@@ -2025,11 +2055,11 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
2025 cifs_put_smb_ses(ses); 2055 cifs_put_smb_ses(ses);
2026} 2056}
2027 2057
2028static struct cifsTconInfo * 2058static struct cifs_tcon *
2029cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info) 2059cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
2030{ 2060{
2031 int rc, xid; 2061 int rc, xid;
2032 struct cifsTconInfo *tcon; 2062 struct cifs_tcon *tcon;
2033 2063
2034 tcon = cifs_find_tcon(ses, volume_info->UNC); 2064 tcon = cifs_find_tcon(ses, volume_info->UNC);
2035 if (tcon) { 2065 if (tcon) {
@@ -2118,8 +2148,102 @@ cifs_put_tlink(struct tcon_link *tlink)
2118 return; 2148 return;
2119} 2149}
2120 2150
2151static inline struct tcon_link *
2152cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb);
2153
2154static int
2155compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
2156{
2157 struct cifs_sb_info *old = CIFS_SB(sb);
2158 struct cifs_sb_info *new = mnt_data->cifs_sb;
2159
2160 if ((sb->s_flags & CIFS_MS_MASK) != (mnt_data->flags & CIFS_MS_MASK))
2161 return 0;
2162
2163 if ((old->mnt_cifs_flags & CIFS_MOUNT_MASK) !=
2164 (new->mnt_cifs_flags & CIFS_MOUNT_MASK))
2165 return 0;
2166
2167 if (old->rsize != new->rsize)
2168 return 0;
2169
2170 /*
 2171 * We want to share the sb only if wsize is unspecified, or the specified
 2172 * wsize is greater than or equal to the existing one.
2173 */
2174 if (new->wsize && new->wsize < old->wsize)
2175 return 0;
2176
2177 if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid)
2178 return 0;
2179
2180 if (old->mnt_file_mode != new->mnt_file_mode ||
2181 old->mnt_dir_mode != new->mnt_dir_mode)
2182 return 0;
2183
2184 if (strcmp(old->local_nls->charset, new->local_nls->charset))
2185 return 0;
2186
2187 if (old->actimeo != new->actimeo)
2188 return 0;
2189
2190 return 1;
2191}
2192
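
The wsize rule above is deliberately one-sided: a new mount that asks for less than the existing superblock's wsize must not share it (sharing would hand it a larger write size than it requested), while a mount asking for the same or more simply inherits the existing value, which is safe because wsize is only an upper bound. The predicate, reduced to a standalone sketch:

    /* sketch: nonzero when an existing sb's wsize satisfies a new request;
     * a new_wsize of 0 means the new mount expressed no preference */
    static int wsize_compatible(unsigned int new_wsize, unsigned int old_wsize)
    {
        return !(new_wsize && new_wsize < old_wsize);
    }
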
2121int 2193int
2122get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path, 2194cifs_match_super(struct super_block *sb, void *data)
2195{
2196 struct cifs_mnt_data *mnt_data = (struct cifs_mnt_data *)data;
2197 struct smb_vol *volume_info;
2198 struct cifs_sb_info *cifs_sb;
2199 struct TCP_Server_Info *tcp_srv;
2200 struct cifs_ses *ses;
2201 struct cifs_tcon *tcon;
2202 struct tcon_link *tlink;
2203 struct sockaddr_storage addr;
2204 int rc = 0;
2205
2206 memset(&addr, 0, sizeof(struct sockaddr_storage));
2207
2208 spin_lock(&cifs_tcp_ses_lock);
2209 cifs_sb = CIFS_SB(sb);
2210 tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
2211 if (IS_ERR(tlink)) {
2212 spin_unlock(&cifs_tcp_ses_lock);
2213 return rc;
2214 }
2215 tcon = tlink_tcon(tlink);
2216 ses = tcon->ses;
2217 tcp_srv = ses->server;
2218
2219 volume_info = mnt_data->vol;
2220
2221 if (!volume_info->UNCip || !volume_info->UNC)
2222 goto out;
2223
2224 rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
2225 volume_info->UNCip,
2226 strlen(volume_info->UNCip),
2227 volume_info->port);
2228 if (!rc)
2229 goto out;
2230
2231 if (!match_server(tcp_srv, (struct sockaddr *)&addr, volume_info) ||
2232 !match_session(ses, volume_info) ||
2233 !match_tcon(tcon, volume_info->UNC)) {
2234 rc = 0;
2235 goto out;
2236 }
2237
2238 rc = compare_mount_options(sb, mnt_data);
2239out:
2240 cifs_put_tlink(tlink);
2241 spin_unlock(&cifs_tcp_ses_lock);
2242 return rc;
2243}
2244
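
cifs_match_super() is where the extracted match_server()/match_session()/match_tcon() predicates pay off: the same tests that locate reusable connections now decide whether an entire superblock can be shared. Elsewhere in this series it is expected to be wired up as the sget() test callback; a hedged sketch of that call site, assuming the four-argument sget() prototype of this kernel generation and that cifs_mnt_data carries the vol, cifs_sb and mount flags used above:

    /* sketch of the assumed caller; not a verbatim copy of cifs_do_mount() */
    struct cifs_mnt_data mnt_data = {
        .vol = volume_info,
        .cifs_sb = cifs_sb,
        .flags = flags,
    };

    sb = sget(fs_type, cifs_match_super, set_anon_super, &mnt_data);
    if (IS_ERR(sb))
        rc = PTR_ERR(sb);
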
2245int
2246get_dfs_path(int xid, struct cifs_ses *pSesInfo, const char *old_path,
2123 const struct nls_table *nls_codepage, unsigned int *pnum_referrals, 2247 const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
2124 struct dfs_info3_param **preferrals, int remap) 2248 struct dfs_info3_param **preferrals, int remap)
2125{ 2249{
@@ -2280,7 +2404,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
2280 smb_buf = (struct smb_hdr *)ses_init_buf; 2404 smb_buf = (struct smb_hdr *)ses_init_buf;
2281 2405
2282 /* sizeof RFC1002_SESSION_REQUEST with no scope */ 2406 /* sizeof RFC1002_SESSION_REQUEST with no scope */
2283 smb_buf->smb_buf_length = 0x81000044; 2407 smb_buf->smb_buf_length = cpu_to_be32(0x81000044);
2284 rc = smb_send(server, smb_buf, 0x44); 2408 rc = smb_send(server, smb_buf, 0x44);
2285 kfree(ses_init_buf); 2409 kfree(ses_init_buf);
2286 /* 2410 /*
@@ -2418,7 +2542,7 @@ ip_connect(struct TCP_Server_Info *server)
2418 return generic_ip_connect(server); 2542 return generic_ip_connect(server);
2419} 2543}
2420 2544
2421void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon, 2545void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon,
2422 struct super_block *sb, struct smb_vol *vol_info) 2546 struct super_block *sb, struct smb_vol *vol_info)
2423{ 2547{
2424 /* if we are reconnecting then should we check to see if 2548 /* if we are reconnecting then should we check to see if
@@ -2447,7 +2571,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2447 2571
2448 if (!CIFSSMBQFSUnixInfo(xid, tcon)) { 2572 if (!CIFSSMBQFSUnixInfo(xid, tcon)) {
2449 __u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability); 2573 __u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
2450 2574 cFYI(1, "unix caps which server supports %lld", cap);
2451 /* check for reconnect case in which we do not 2575 /* check for reconnect case in which we do not
2452 want to change the mount behavior if we can avoid it */ 2576 want to change the mount behavior if we can avoid it */
2453 if (vol_info == NULL) { 2577 if (vol_info == NULL) {
@@ -2465,6 +2589,9 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2465 } 2589 }
2466 } 2590 }
2467 2591
2592 if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)
2593 cERROR(1, "per-share encryption not supported yet");
2594
2468 cap &= CIFS_UNIX_CAP_MASK; 2595 cap &= CIFS_UNIX_CAP_MASK;
2469 if (vol_info && vol_info->no_psx_acl) 2596 if (vol_info && vol_info->no_psx_acl)
2470 cap &= ~CIFS_UNIX_POSIX_ACL_CAP; 2597 cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
@@ -2483,12 +2610,6 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2483 CIFS_MOUNT_POSIX_PATHS; 2610 CIFS_MOUNT_POSIX_PATHS;
2484 } 2611 }
2485 2612
2486 /* We might be setting the path sep back to a different
2487 form if we are reconnecting and the server switched its
2488 posix path capability for this share */
2489 if (sb && (CIFS_SB(sb)->prepathlen > 0))
2490 CIFS_SB(sb)->prepath[0] = CIFS_DIR_SEP(CIFS_SB(sb));
2491
2492 if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) { 2613 if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) {
2493 if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) { 2614 if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
2494 CIFS_SB(sb)->rsize = 127 * 1024; 2615 CIFS_SB(sb)->rsize = 127 * 1024;
@@ -2513,6 +2634,10 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2513 cFYI(1, "very large read cap"); 2634 cFYI(1, "very large read cap");
2514 if (cap & CIFS_UNIX_LARGE_WRITE_CAP) 2635 if (cap & CIFS_UNIX_LARGE_WRITE_CAP)
2515 cFYI(1, "very large write cap"); 2636 cFYI(1, "very large write cap");
2637 if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP)
2638 cFYI(1, "transport encryption cap");
2639 if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)
2640 cFYI(1, "mandatory transport encryption cap");
2516#endif /* CIFS_DEBUG2 */ 2641#endif /* CIFS_DEBUG2 */
2517 if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) { 2642 if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) {
2518 if (vol_info == NULL) { 2643 if (vol_info == NULL) {
@@ -2529,28 +2654,8 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2529 } 2654 }
2530} 2655}
2531 2656
2532static void 2657void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
2533convert_delimiter(char *path, char delim) 2658 struct cifs_sb_info *cifs_sb)
2534{
2535 int i;
2536 char old_delim;
2537
2538 if (path == NULL)
2539 return;
2540
2541 if (delim == '/')
2542 old_delim = '\\';
2543 else
2544 old_delim = '/';
2545
2546 for (i = 0; path[i] != '\0'; i++) {
2547 if (path[i] == old_delim)
2548 path[i] = delim;
2549 }
2550}
2551
2552static void setup_cifs_sb(struct smb_vol *pvolume_info,
2553 struct cifs_sb_info *cifs_sb)
2554{ 2659{
2555 INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks); 2660 INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
2556 2661
@@ -2564,40 +2669,19 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2564 else /* default */ 2669 else /* default */
2565 cifs_sb->rsize = CIFSMaxBufSize; 2670 cifs_sb->rsize = CIFSMaxBufSize;
2566 2671
2567 if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
2568 cERROR(1, "wsize %d too large, using 4096 instead",
2569 pvolume_info->wsize);
2570 cifs_sb->wsize = 4096;
2571 } else if (pvolume_info->wsize)
2572 cifs_sb->wsize = pvolume_info->wsize;
2573 else
2574 cifs_sb->wsize = min_t(const int,
2575 PAGEVEC_SIZE * PAGE_CACHE_SIZE,
2576 127*1024);
2577 /* old default of CIFSMaxBufSize was too small now
2578 that SMB Write2 can send multiple pages in kvec.
2579 RFC1001 does not describe what happens when frame
2580 bigger than 128K is sent so use that as max in
2581 conjunction with 52K kvec constraint on arch with 4K
2582 page size */
2583
2584 if (cifs_sb->rsize < 2048) { 2672 if (cifs_sb->rsize < 2048) {
2585 cifs_sb->rsize = 2048; 2673 cifs_sb->rsize = 2048;
2586 /* Windows ME may prefer this */ 2674 /* Windows ME may prefer this */
2587 cFYI(1, "readsize set to minimum: 2048"); 2675 cFYI(1, "readsize set to minimum: 2048");
2588 } 2676 }
2589 /* calculate prepath */ 2677
2590 cifs_sb->prepath = pvolume_info->prepath; 2678 /*
2591 if (cifs_sb->prepath) { 2679 * Temporarily set wsize for matching superblock. If we end up using
2592 cifs_sb->prepathlen = strlen(cifs_sb->prepath); 2680 * new sb then cifs_negotiate_wsize will later negotiate it downward
2593 /* we can not convert the / to \ in the path 2681 * if needed.
2594 separators in the prefixpath yet because we do not 2682 */
2595 know (until reset_cifs_unix_caps is called later) 2683 cifs_sb->wsize = pvolume_info->wsize;
2596 whether POSIX PATH CAP is available. We normalize 2684
2597 the / to \ after reset_cifs_unix_caps is called */
2598 pvolume_info->prepath = NULL;
2599 } else
2600 cifs_sb->prepathlen = 0;
2601 cifs_sb->mnt_uid = pvolume_info->linux_uid; 2685 cifs_sb->mnt_uid = pvolume_info->linux_uid;
2602 cifs_sb->mnt_gid = pvolume_info->linux_gid; 2686 cifs_sb->mnt_gid = pvolume_info->linux_gid;
2603 cifs_sb->mnt_file_mode = pvolume_info->file_mode; 2687 cifs_sb->mnt_file_mode = pvolume_info->file_mode;
@@ -2606,6 +2690,7 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2606 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); 2690 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
2607 2691
2608 cifs_sb->actimeo = pvolume_info->actimeo; 2692 cifs_sb->actimeo = pvolume_info->actimeo;
2693 cifs_sb->local_nls = pvolume_info->local_nls;
2609 2694
2610 if (pvolume_info->noperm) 2695 if (pvolume_info->noperm)
2611 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; 2696 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
@@ -2625,6 +2710,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2625 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC; 2710 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC;
2626 if (pvolume_info->mand_lock) 2711 if (pvolume_info->mand_lock)
2627 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL; 2712 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL;
2713 if (pvolume_info->rwpidforward)
2714 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD;
2628 if (pvolume_info->cifs_acl) 2715 if (pvolume_info->cifs_acl)
2629 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; 2716 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
2630 if (pvolume_info->override_uid) 2717 if (pvolume_info->override_uid)
@@ -2658,8 +2745,55 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2658 "mount option supported"); 2745 "mount option supported");
2659} 2746}
2660 2747
2748/*
2749 * When the server supports very large writes via POSIX extensions, we can
2750 * allow up to 2^24 - PAGE_CACHE_SIZE.
2751 *
2752 * Note that this might make for "interesting" allocation problems during
2753 * writeback however (as we have to allocate an array of pointers for the
 2754 * pages). A 16M write means a ~32KB page array with PAGE_CACHE_SIZE == 4096.
2755 */
2756#define CIFS_MAX_WSIZE ((1<<24) - PAGE_CACHE_SIZE)
2757
2758/*
2759 * When the server doesn't allow large posix writes, default to a wsize of
2760 * 128k - PAGE_CACHE_SIZE -- one page less than the largest frame size
2761 * described in RFC1001. This allows space for the header without going over
2762 * that by default.
2763 */
2764#define CIFS_MAX_RFC1001_WSIZE (128 * 1024 - PAGE_CACHE_SIZE)
2765
2766/*
2767 * The default wsize is 1M. find_get_pages seems to return a maximum of 256
2768 * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill
2769 * a single wsize request with a single call.
2770 */
2771#define CIFS_DEFAULT_WSIZE (1024 * 1024)
2772
2773static unsigned int
2774cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
2775{
2776 __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
2777 struct TCP_Server_Info *server = tcon->ses->server;
2778 unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize :
2779 CIFS_DEFAULT_WSIZE;
2780
2781 /* can server support 24-bit write sizes? (via UNIX extensions) */
2782 if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
2783 wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1001_WSIZE);
2784
2785 /* no CAP_LARGE_WRITE_X? Limit it to 16 bits */
2786 if (!(server->capabilities & CAP_LARGE_WRITE_X))
2787 wsize = min_t(unsigned int, wsize, USHRT_MAX);
2788
2789 /* hard limit of CIFS_MAX_WSIZE */
2790 wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE);
2791
2792 return wsize;
2793}
2794
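
For concreteness, with PAGE_CACHE_SIZE == 4096 the three limits above come to 16773120, 126976 and 65535 bytes. A small user-space check of the same arithmetic (constants copied from the macros; min_u stands in for min_t):

    #include <stdio.h>

    #define PAGE_CACHE_SIZE 4096u
    #define CIFS_MAX_WSIZE ((1u << 24) - PAGE_CACHE_SIZE)
    #define CIFS_MAX_RFC1001_WSIZE (128u * 1024 - PAGE_CACHE_SIZE)
    #define CIFS_DEFAULT_WSIZE (1024u * 1024)

    static unsigned int min_u(unsigned int a, unsigned int b)
    {
        return a < b ? a : b;
    }

    int main(void)
    {
        unsigned int wsize = CIFS_DEFAULT_WSIZE;

        /* no large POSIX writes: one page under the 128K RFC1001 frame */
        printf("%u\n", min_u(wsize, CIFS_MAX_RFC1001_WSIZE)); /* 126976 */
        /* additionally no CAP_LARGE_WRITE_X: clamp to 16 bits */
        printf("%u\n", min_u(wsize, 65535u));                 /* 65535 */
        /* large POSIX writes available: only the hard cap applies */
        printf("%u\n", min_u(wsize, CIFS_MAX_WSIZE));         /* 1048576 */
        return 0;
    }
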
2661static int 2795static int
2662is_path_accessible(int xid, struct cifsTconInfo *tcon, 2796is_path_accessible(int xid, struct cifs_tcon *tcon,
2663 struct cifs_sb_info *cifs_sb, const char *full_path) 2797 struct cifs_sb_info *cifs_sb, const char *full_path)
2664{ 2798{
2665 int rc; 2799 int rc;
@@ -2682,8 +2816,8 @@ is_path_accessible(int xid, struct cifsTconInfo *tcon,
2682 return rc; 2816 return rc;
2683} 2817}
2684 2818
2685static void 2819void
2686cleanup_volume_info(struct smb_vol **pvolume_info) 2820cifs_cleanup_volume_info(struct smb_vol **pvolume_info)
2687{ 2821{
2688 struct smb_vol *volume_info; 2822 struct smb_vol *volume_info;
2689 2823
@@ -2691,8 +2825,12 @@ cleanup_volume_info(struct smb_vol **pvolume_info)
2691 return; 2825 return;
2692 2826
2693 volume_info = *pvolume_info; 2827 volume_info = *pvolume_info;
2828 kfree(volume_info->username);
2694 kzfree(volume_info->password); 2829 kzfree(volume_info->password);
2695 kfree(volume_info->UNC); 2830 kfree(volume_info->UNC);
2831 kfree(volume_info->UNCip);
2832 kfree(volume_info->domainname);
2833 kfree(volume_info->iocharset);
2696 kfree(volume_info->prepath); 2834 kfree(volume_info->prepath);
2697 kfree(volume_info); 2835 kfree(volume_info);
2698 *pvolume_info = NULL; 2836 *pvolume_info = NULL;
@@ -2709,55 +2847,78 @@ build_unc_path_to_root(const struct smb_vol *volume_info,
2709 char *full_path; 2847 char *full_path;
2710 2848
2711 int unc_len = strnlen(volume_info->UNC, MAX_TREE_SIZE + 1); 2849 int unc_len = strnlen(volume_info->UNC, MAX_TREE_SIZE + 1);
2712 full_path = kmalloc(unc_len + cifs_sb->prepathlen + 1, GFP_KERNEL); 2850 full_path = kmalloc(unc_len + 1, GFP_KERNEL);
2713 if (full_path == NULL) 2851 if (full_path == NULL)
2714 return ERR_PTR(-ENOMEM); 2852 return ERR_PTR(-ENOMEM);
2715 2853
2716 strncpy(full_path, volume_info->UNC, unc_len); 2854 strncpy(full_path, volume_info->UNC, unc_len);
2717 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { 2855 full_path[unc_len] = 0; /* add trailing null */
2718 int i; 2856 convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
2719 for (i = 0; i < unc_len; i++) {
2720 if (full_path[i] == '\\')
2721 full_path[i] = '/';
2722 }
2723 }
2724
2725 if (cifs_sb->prepathlen)
2726 strncpy(full_path + unc_len, cifs_sb->prepath,
2727 cifs_sb->prepathlen);
2728
2729 full_path[unc_len + cifs_sb->prepathlen] = 0; /* add trailing null */
2730 return full_path; 2857 return full_path;
2731} 2858}
2732#endif
2733 2859
2734int 2860/*
2735cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, 2861 * Perform a dfs referral query for a share and (optionally) prefix
2736 char *mount_data_global, const char *devname) 2862 *
2863 * If a referral is found, cifs_sb->mountdata will be (re-)allocated
2864 * to a string containing updated options for the submount. Otherwise it
2865 * will be left untouched.
2866 *
2867 * Returns the rc from get_dfs_path to the caller, which can be used to
2868 * determine whether there were referrals.
2869 */
2870static int
2871expand_dfs_referral(int xid, struct cifs_ses *pSesInfo,
2872 struct smb_vol *volume_info, struct cifs_sb_info *cifs_sb,
2873 int check_prefix)
2737{ 2874{
2738 int rc; 2875 int rc;
2739 int xid;
2740 struct smb_vol *volume_info;
2741 struct cifsSesInfo *pSesInfo;
2742 struct cifsTconInfo *tcon;
2743 struct TCP_Server_Info *srvTcp;
2744 char *full_path;
2745 char *mount_data = mount_data_global;
2746 struct tcon_link *tlink;
2747#ifdef CONFIG_CIFS_DFS_UPCALL
2748 struct dfs_info3_param *referrals = NULL;
2749 unsigned int num_referrals = 0; 2876 unsigned int num_referrals = 0;
2750 int referral_walks_count = 0; 2877 struct dfs_info3_param *referrals = NULL;
2751try_mount_again: 2878 char *full_path = NULL, *ref_path = NULL, *mdata = NULL;
2879
2880 full_path = build_unc_path_to_root(volume_info, cifs_sb);
2881 if (IS_ERR(full_path))
2882 return PTR_ERR(full_path);
2883
2884 /* For DFS paths, skip the first '\' of the UNC */
2885 ref_path = check_prefix ? full_path + 1 : volume_info->UNC + 1;
2886
 2887 rc = get_dfs_path(xid, pSesInfo, ref_path, cifs_sb->local_nls,
2888 &num_referrals, &referrals,
2889 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
2890
2891 if (!rc && num_referrals > 0) {
2892 char *fake_devname = NULL;
2893
2894 mdata = cifs_compose_mount_options(cifs_sb->mountdata,
2895 full_path + 1, referrals,
2896 &fake_devname);
2897
2898 free_dfs_info_array(referrals, num_referrals);
2899 kfree(fake_devname);
2900
2901 if (cifs_sb->mountdata != NULL)
2902 kfree(cifs_sb->mountdata);
2903
2904 if (IS_ERR(mdata)) {
2905 rc = PTR_ERR(mdata);
2906 mdata = NULL;
2907 }
2908 cifs_sb->mountdata = mdata;
2909 }
2910 kfree(full_path);
2911 return rc;
2912}
2752#endif 2913#endif
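
To make the mountdata rewrite concrete: when the queried path turns out to be a DFS link, cifs_compose_mount_options() swaps the device and address options so that the retried mount targets the referral. An illustrative before/after with entirely hypothetical share names and addresses:

    before:  ip=192.0.2.1,unc=\\dfsroot\pub,prefixpath=tools,...
    after:   ip=192.0.2.7,unc=\\fileserver\pub2,prefixpath=tools,...
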
2753 rc = 0;
2754 tcon = NULL;
2755 pSesInfo = NULL;
2756 srvTcp = NULL;
2757 full_path = NULL;
2758 tlink = NULL;
2759 2914
2760 xid = GetXid(); 2915int cifs_setup_volume_info(struct smb_vol **pvolume_info, char *mount_data,
2916 const char *devname)
2917{
2918 struct smb_vol *volume_info;
2919 int rc = 0;
2920
2921 *pvolume_info = NULL;
2761 2922
2762 volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL); 2923 volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL);
2763 if (!volume_info) { 2924 if (!volume_info) {
@@ -2765,7 +2926,8 @@ try_mount_again:
2765 goto out; 2926 goto out;
2766 } 2927 }
2767 2928
2768 if (cifs_parse_mount_options(mount_data, devname, volume_info)) { 2929 if (cifs_parse_mount_options(mount_data, devname,
2930 volume_info)) {
2769 rc = -EINVAL; 2931 rc = -EINVAL;
2770 goto out; 2932 goto out;
2771 } 2933 }
@@ -2797,7 +2959,46 @@ try_mount_again:
2797 goto out; 2959 goto out;
2798 } 2960 }
2799 } 2961 }
2800 cifs_sb->local_nls = volume_info->local_nls; 2962
2963 *pvolume_info = volume_info;
2964 return rc;
2965out:
2966 cifs_cleanup_volume_info(&volume_info);
2967 return rc;
2968}
2969
2970int
2971cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2972 struct smb_vol *volume_info, const char *devname)
2973{
2974 int rc = 0;
2975 int xid;
2976 struct cifs_ses *pSesInfo;
2977 struct cifs_tcon *tcon;
2978 struct TCP_Server_Info *srvTcp;
2979 char *full_path;
2980 struct tcon_link *tlink;
2981#ifdef CONFIG_CIFS_DFS_UPCALL
2982 int referral_walks_count = 0;
2983try_mount_again:
2984 /* cleanup activities if we're chasing a referral */
2985 if (referral_walks_count) {
2986 if (tcon)
2987 cifs_put_tcon(tcon);
2988 else if (pSesInfo)
2989 cifs_put_smb_ses(pSesInfo);
2990
2991 cifs_cleanup_volume_info(&volume_info);
2992 FreeXid(xid);
2993 }
2994#endif
2995 tcon = NULL;
2996 pSesInfo = NULL;
2997 srvTcp = NULL;
2998 full_path = NULL;
2999 tlink = NULL;
3000
3001 xid = GetXid();
2801 3002
2802 /* get a reference to a tcp session */ 3003 /* get a reference to a tcp session */
2803 srvTcp = cifs_get_tcp_session(volume_info); 3004 srvTcp = cifs_get_tcp_session(volume_info);
@@ -2814,7 +3015,6 @@ try_mount_again:
2814 goto mount_fail_check; 3015 goto mount_fail_check;
2815 } 3016 }
2816 3017
2817 setup_cifs_sb(volume_info, cifs_sb);
2818 if (pSesInfo->capabilities & CAP_LARGE_FILES) 3018 if (pSesInfo->capabilities & CAP_LARGE_FILES)
2819 sb->s_maxbytes = MAX_LFS_FILESIZE; 3019 sb->s_maxbytes = MAX_LFS_FILESIZE;
2820 else 3020 else
@@ -2831,40 +3031,59 @@ try_mount_again:
2831 goto remote_path_check; 3031 goto remote_path_check;
2832 } 3032 }
2833 3033
2834 /* do not care if following two calls succeed - informational */
2835 if (!tcon->ipc) {
2836 CIFSSMBQFSDeviceInfo(xid, tcon);
2837 CIFSSMBQFSAttributeInfo(xid, tcon);
2838 }
2839
2840 /* tell server which Unix caps we support */ 3034 /* tell server which Unix caps we support */
2841 if (tcon->ses->capabilities & CAP_UNIX) 3035 if (tcon->ses->capabilities & CAP_UNIX) {
2842 /* reset of caps checks mount to see if unix extensions 3036 /* reset of caps checks mount to see if unix extensions
2843 disabled for just this mount */ 3037 disabled for just this mount */
2844 reset_cifs_unix_caps(xid, tcon, sb, volume_info); 3038 reset_cifs_unix_caps(xid, tcon, sb, volume_info);
2845 else 3039 if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) &&
3040 (le64_to_cpu(tcon->fsUnixInfo.Capability) &
3041 CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) {
3042 rc = -EACCES;
3043 goto mount_fail_check;
3044 }
3045 } else
2846 tcon->unix_ext = 0; /* server does not support them */ 3046 tcon->unix_ext = 0; /* server does not support them */
2847 3047
2848 /* convert forward to back slashes in prepath here if needed */ 3048 /* do not care if following two calls succeed - informational */
2849 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0) 3049 if (!tcon->ipc) {
2850 convert_delimiter(cifs_sb->prepath, CIFS_DIR_SEP(cifs_sb)); 3050 CIFSSMBQFSDeviceInfo(xid, tcon);
3051 CIFSSMBQFSAttributeInfo(xid, tcon);
3052 }
2851 3053
2852 if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) { 3054 if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
2853 cifs_sb->rsize = 1024 * 127; 3055 cifs_sb->rsize = 1024 * 127;
2854 cFYI(DBG2, "no very large read support, rsize now 127K"); 3056 cFYI(DBG2, "no very large read support, rsize now 127K");
2855 } 3057 }
2856 if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
2857 cifs_sb->wsize = min(cifs_sb->wsize,
2858 (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
2859 if (!(tcon->ses->capabilities & CAP_LARGE_READ_X)) 3058 if (!(tcon->ses->capabilities & CAP_LARGE_READ_X))
2860 cifs_sb->rsize = min(cifs_sb->rsize, 3059 cifs_sb->rsize = min(cifs_sb->rsize,
2861 (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); 3060 (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
2862 3061
3062 cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info);
3063
2863remote_path_check: 3064remote_path_check:
2864 /* check if a whole path (including prepath) is not remote */ 3065#ifdef CONFIG_CIFS_DFS_UPCALL
3066 /*
3067 * Perform an unconditional check for whether there are DFS
3068 * referrals for this path without prefix, to provide support
3069 * for DFS referrals from w2k8 servers which don't seem to respond
3070 * with PATH_NOT_COVERED to requests that include the prefix.
3071 * Chase the referral if found, otherwise continue normally.
3072 */
3073 if (referral_walks_count == 0) {
3074 int refrc = expand_dfs_referral(xid, pSesInfo, volume_info,
3075 cifs_sb, false);
3076 if (!refrc) {
3077 referral_walks_count++;
3078 goto try_mount_again;
3079 }
3080 }
3081#endif
3082
3083 /* check if a whole path is not remote */
2865 if (!rc && tcon) { 3084 if (!rc && tcon) {
2866 /* build_path_to_root works only when we have a valid tcon */ 3085 /* build_path_to_root works only when we have a valid tcon */
2867 full_path = cifs_build_path_to_root(cifs_sb, tcon); 3086 full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon);
2868 if (full_path == NULL) { 3087 if (full_path == NULL) {
2869 rc = -ENOMEM; 3088 rc = -ENOMEM;
2870 goto mount_fail_check; 3089 goto mount_fail_check;
@@ -2890,50 +3109,15 @@ remote_path_check:
2890 rc = -ELOOP; 3109 rc = -ELOOP;
2891 goto mount_fail_check; 3110 goto mount_fail_check;
2892 } 3111 }
2893 /* convert forward to back slashes in prepath here if needed */
2894 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
2895 convert_delimiter(cifs_sb->prepath,
2896 CIFS_DIR_SEP(cifs_sb));
2897 full_path = build_unc_path_to_root(volume_info, cifs_sb);
2898 if (IS_ERR(full_path)) {
2899 rc = PTR_ERR(full_path);
2900 goto mount_fail_check;
2901 }
2902
2903 cFYI(1, "Getting referral for: %s", full_path);
2904 rc = get_dfs_path(xid, pSesInfo , full_path + 1,
2905 cifs_sb->local_nls, &num_referrals, &referrals,
2906 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
2907 if (!rc && num_referrals > 0) {
2908 char *fake_devname = NULL;
2909 3112
2910 if (mount_data != mount_data_global) 3113 rc = expand_dfs_referral(xid, pSesInfo, volume_info, cifs_sb,
2911 kfree(mount_data); 3114 true);
2912 3115
2913 mount_data = cifs_compose_mount_options( 3116 if (!rc) {
2914 cifs_sb->mountdata, full_path + 1,
2915 referrals, &fake_devname);
2916
2917 free_dfs_info_array(referrals, num_referrals);
2918 kfree(fake_devname);
2919 kfree(full_path);
2920
2921 if (IS_ERR(mount_data)) {
2922 rc = PTR_ERR(mount_data);
2923 mount_data = NULL;
2924 goto mount_fail_check;
2925 }
2926
2927 if (tcon)
2928 cifs_put_tcon(tcon);
2929 else if (pSesInfo)
2930 cifs_put_smb_ses(pSesInfo);
2931
2932 cleanup_volume_info(&volume_info);
2933 referral_walks_count++; 3117 referral_walks_count++;
2934 FreeXid(xid);
2935 goto try_mount_again; 3118 goto try_mount_again;
2936 } 3119 }
3120 goto mount_fail_check;
2937#else /* No DFS support, return error on mount */ 3121#else /* No DFS support, return error on mount */
2938 rc = -EOPNOTSUPP; 3122 rc = -EOPNOTSUPP;
2939#endif 3123#endif
@@ -2966,8 +3150,6 @@ remote_path_check:
2966mount_fail_check: 3150mount_fail_check:
2967 /* on error free sesinfo and tcon struct if needed */ 3151 /* on error free sesinfo and tcon struct if needed */
2968 if (rc) { 3152 if (rc) {
2969 if (mount_data != mount_data_global)
2970 kfree(mount_data);
2971 /* If find_unc succeeded then rc == 0 so we can not end */ 3153 /* If find_unc succeeded then rc == 0 so we can not end */
2972 /* up accidentally freeing someone elses tcon struct */ 3154 /* up accidentally freeing someone elses tcon struct */
2973 if (tcon) 3155 if (tcon)
@@ -2985,14 +3167,13 @@ mount_fail_check:
2985 password will be freed at unmount time) */ 3167 password will be freed at unmount time) */
2986out: 3168out:
2987 /* zero out password before freeing */ 3169 /* zero out password before freeing */
2988 cleanup_volume_info(&volume_info);
2989 FreeXid(xid); 3170 FreeXid(xid);
2990 return rc; 3171 return rc;
2991} 3172}
2992 3173
2993int 3174int
2994CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, 3175CIFSTCon(unsigned int xid, struct cifs_ses *ses,
2995 const char *tree, struct cifsTconInfo *tcon, 3176 const char *tree, struct cifs_tcon *tcon,
2996 const struct nls_table *nls_codepage) 3177 const struct nls_table *nls_codepage)
2997{ 3178{
2998 struct smb_hdr *smb_buffer; 3179 struct smb_hdr *smb_buffer;
@@ -3024,7 +3205,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3024 pSMB->AndXCommand = 0xFF; 3205 pSMB->AndXCommand = 0xFF;
3025 pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO); 3206 pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO);
3026 bcc_ptr = &pSMB->Password[0]; 3207 bcc_ptr = &pSMB->Password[0];
3027 if ((ses->server->secMode) & SECMODE_USER) { 3208 if ((ses->server->sec_mode) & SECMODE_USER) {
3028 pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ 3209 pSMB->PasswordLength = cpu_to_le16(1); /* minimum */
3029 *bcc_ptr = 0; /* password is null byte */ 3210 *bcc_ptr = 0; /* password is null byte */
3030 bcc_ptr++; /* skip password */ 3211 bcc_ptr++; /* skip password */
@@ -3041,7 +3222,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3041 if ((global_secflags & CIFSSEC_MAY_LANMAN) && 3222 if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
3042 (ses->server->secType == LANMAN)) 3223 (ses->server->secType == LANMAN))
3043 calc_lanman_hash(tcon->password, ses->server->cryptkey, 3224 calc_lanman_hash(tcon->password, ses->server->cryptkey,
3044 ses->server->secMode & 3225 ses->server->sec_mode &
3045 SECMODE_PW_ENCRYPT ? true : false, 3226 SECMODE_PW_ENCRYPT ? true : false,
3046 bcc_ptr); 3227 bcc_ptr);
3047 else 3228 else
@@ -3057,7 +3238,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3057 } 3238 }
3058 } 3239 }
3059 3240
3060 if (ses->server->secMode & 3241 if (ses->server->sec_mode &
3061 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 3242 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
3062 smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; 3243 smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
3063 3244
@@ -3083,7 +3264,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3083 bcc_ptr += strlen("?????"); 3264 bcc_ptr += strlen("?????");
3084 bcc_ptr += 1; 3265 bcc_ptr += 1;
3085 count = bcc_ptr - &pSMB->Password[0]; 3266 count = bcc_ptr - &pSMB->Password[0];
3086 pSMB->hdr.smb_buf_length += count; 3267 pSMB->hdr.smb_buf_length = cpu_to_be32(be32_to_cpu(
3268 pSMB->hdr.smb_buf_length) + count);
3087 pSMB->ByteCount = cpu_to_le16(count); 3269 pSMB->ByteCount = cpu_to_le16(count);
3088 3270
3089 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length, 3271 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
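
The smb_buf_length changes here (and in the RFC1001 session request earlier in this patch) make the in-memory field hold the on-the-wire big-endian value, so any arithmetic on it must round-trip through be32_to_cpu()/cpu_to_be32(). A user-space illustration of the byte order, with htonl() standing in for cpu_to_be32():

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* 0x81 = RFC1002 session request, 0x44 = 68-byte payload, no scope */
        uint32_t hdr = htonl(0x81000044u);
        const unsigned char *p = (const unsigned char *)&hdr;

        /* the message type must be the first byte on the wire */
        printf("%02x %02x %02x %02x\n", p[0], p[1], p[2], p[3]);
        return 0;   /* prints: 81 00 00 44 */
    }
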
@@ -3152,7 +3334,6 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
3152 struct rb_root *root = &cifs_sb->tlink_tree; 3334 struct rb_root *root = &cifs_sb->tlink_tree;
3153 struct rb_node *node; 3335 struct rb_node *node;
3154 struct tcon_link *tlink; 3336 struct tcon_link *tlink;
3155 char *tmp;
3156 3337
3157 cancel_delayed_work_sync(&cifs_sb->prune_tlinks); 3338 cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
3158 3339
@@ -3169,15 +3350,10 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
3169 } 3350 }
3170 spin_unlock(&cifs_sb->tlink_tree_lock); 3351 spin_unlock(&cifs_sb->tlink_tree_lock);
3171 3352
3172 tmp = cifs_sb->prepath;
3173 cifs_sb->prepathlen = 0;
3174 cifs_sb->prepath = NULL;
3175 kfree(tmp);
3176
3177 return 0; 3353 return 0;
3178} 3354}
3179 3355
3180int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses) 3356int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses)
3181{ 3357{
3182 int rc = 0; 3358 int rc = 0;
3183 struct TCP_Server_Info *server = ses->server; 3359 struct TCP_Server_Info *server = ses->server;
@@ -3207,7 +3383,7 @@ int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
3207} 3383}
3208 3384
3209 3385
3210int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses, 3386int cifs_setup_session(unsigned int xid, struct cifs_ses *ses,
3211 struct nls_table *nls_info) 3387 struct nls_table *nls_info)
3212{ 3388{
3213 int rc = 0; 3389 int rc = 0;
@@ -3219,7 +3395,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
3219 ses->capabilities &= (~CAP_UNIX); 3395 ses->capabilities &= (~CAP_UNIX);
3220 3396
3221 cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", 3397 cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
3222 server->secMode, server->capabilities, server->timeAdj); 3398 server->sec_mode, server->capabilities, server->timeAdj);
3223 3399
3224 rc = CIFS_SessSetup(xid, ses, nls_info); 3400 rc = CIFS_SessSetup(xid, ses, nls_info);
3225 if (rc) { 3401 if (rc) {
@@ -3251,14 +3427,16 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
3251 return rc; 3427 return rc;
3252} 3428}
3253 3429
3254static struct cifsTconInfo * 3430static struct cifs_tcon *
3255cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid) 3431cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
3256{ 3432{
3257 struct cifsTconInfo *master_tcon = cifs_sb_master_tcon(cifs_sb); 3433 struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb);
3258 struct cifsSesInfo *ses; 3434 struct cifs_ses *ses;
3259 struct cifsTconInfo *tcon = NULL; 3435 struct cifs_tcon *tcon = NULL;
3260 struct smb_vol *vol_info; 3436 struct smb_vol *vol_info;
3261 char username[MAX_USERNAME_SIZE + 1]; 3437 char username[28]; /* big enough for "krb50x" + hex of ULONG_MAX 6+16 */
 3438 /* We used to size this as MAX_USERNAME_SIZE, which is */
 3439 /* way too big now (256 instead of 32) */
3262 3440
3263 vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL); 3441 vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
3264 if (vol_info == NULL) { 3442 if (vol_info == NULL) {
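
The sizing comment above checks out: "krb50x" is 6 characters and ULONG_MAX prints as at most 16 hex digits, so 6 + 16 + 1 = 23 fits in 28. The same arithmetic as a compile-time check (user-space C11, assuming a 64-bit unsigned long; it also holds for 32-bit):

    #include <assert.h>

    /* "krb50x" (6 chars) + 2 hex digits per byte of unsigned long + NUL */
    static_assert(sizeof("krb50x") - 1 + 2 * sizeof(unsigned long) + 1 <= 28,
                  "username buffer too small");
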
@@ -3287,7 +3465,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
3287 3465
3288 ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info); 3466 ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info);
3289 if (IS_ERR(ses)) { 3467 if (IS_ERR(ses)) {
3290 tcon = (struct cifsTconInfo *)ses; 3468 tcon = (struct cifs_tcon *)ses;
3291 cifs_put_tcp_session(master_tcon->ses->server); 3469 cifs_put_tcp_session(master_tcon->ses->server);
3292 goto out; 3470 goto out;
3293 } 3471 }
@@ -3312,7 +3490,7 @@ cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
3312 return cifs_sb->master_tlink; 3490 return cifs_sb->master_tlink;
3313} 3491}
3314 3492
3315struct cifsTconInfo * 3493struct cifs_tcon *
3316cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb) 3494cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
3317{ 3495{
3318 return tlink_tcon(cifs_sb_master_tlink(cifs_sb)); 3496 return tlink_tcon(cifs_sb_master_tlink(cifs_sb));
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 9ea65cf36714..81914df47ef1 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -50,12 +50,11 @@ build_path_from_dentry(struct dentry *direntry)
50{ 50{
51 struct dentry *temp; 51 struct dentry *temp;
52 int namelen; 52 int namelen;
53 int pplen;
54 int dfsplen; 53 int dfsplen;
55 char *full_path; 54 char *full_path;
56 char dirsep; 55 char dirsep;
57 struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); 56 struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
58 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb); 57 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
59 58
60 if (direntry == NULL) 59 if (direntry == NULL)
61 return NULL; /* not much we can do if dentry is freed and 60 return NULL; /* not much we can do if dentry is freed and
@@ -63,13 +62,12 @@ build_path_from_dentry(struct dentry *direntry)
63 when the server crashed */ 62 when the server crashed */
64 63
65 dirsep = CIFS_DIR_SEP(cifs_sb); 64 dirsep = CIFS_DIR_SEP(cifs_sb);
66 pplen = cifs_sb->prepathlen;
67 if (tcon->Flags & SMB_SHARE_IS_IN_DFS) 65 if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
68 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); 66 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
69 else 67 else
70 dfsplen = 0; 68 dfsplen = 0;
71cifs_bp_rename_retry: 69cifs_bp_rename_retry:
72 namelen = pplen + dfsplen; 70 namelen = dfsplen;
73 for (temp = direntry; !IS_ROOT(temp);) { 71 for (temp = direntry; !IS_ROOT(temp);) {
74 namelen += (1 + temp->d_name.len); 72 namelen += (1 + temp->d_name.len);
75 temp = temp->d_parent; 73 temp = temp->d_parent;
@@ -100,7 +98,7 @@ cifs_bp_rename_retry:
100 return NULL; 98 return NULL;
101 } 99 }
102 } 100 }
103 if (namelen != pplen + dfsplen) { 101 if (namelen != dfsplen) {
104 cERROR(1, "did not end path lookup where expected namelen is %d", 102 cERROR(1, "did not end path lookup where expected namelen is %d",
105 namelen); 103 namelen);
106 /* presumably this is only possible if racing with a rename 104 /* presumably this is only possible if racing with a rename
@@ -126,7 +124,6 @@ cifs_bp_rename_retry:
126 } 124 }
127 } 125 }
128 } 126 }
129 strncpy(full_path + dfsplen, CIFS_SB(direntry->d_sb)->prepath, pplen);
130 return full_path; 127 return full_path;
131} 128}
132 129
@@ -152,7 +149,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
152 __u16 fileHandle; 149 __u16 fileHandle;
153 struct cifs_sb_info *cifs_sb; 150 struct cifs_sb_info *cifs_sb;
154 struct tcon_link *tlink; 151 struct tcon_link *tlink;
155 struct cifsTconInfo *tcon; 152 struct cifs_tcon *tcon;
156 char *full_path = NULL; 153 char *full_path = NULL;
157 FILE_ALL_INFO *buf = NULL; 154 FILE_ALL_INFO *buf = NULL;
158 struct inode *newinode = NULL; 155 struct inode *newinode = NULL;
@@ -356,7 +353,8 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
356 int xid; 353 int xid;
357 struct cifs_sb_info *cifs_sb; 354 struct cifs_sb_info *cifs_sb;
358 struct tcon_link *tlink; 355 struct tcon_link *tlink;
359 struct cifsTconInfo *pTcon; 356 struct cifs_tcon *pTcon;
357 struct cifs_io_parms io_parms;
360 char *full_path = NULL; 358 char *full_path = NULL;
361 struct inode *newinode = NULL; 359 struct inode *newinode = NULL;
362 int oplock = 0; 360 int oplock = 0;
@@ -439,16 +437,19 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
439 * timestamps in, but we can reuse it safely */ 437 * timestamps in, but we can reuse it safely */
440 438
441 pdev = (struct win_dev *)buf; 439 pdev = (struct win_dev *)buf;
440 io_parms.netfid = fileHandle;
441 io_parms.pid = current->tgid;
442 io_parms.tcon = pTcon;
443 io_parms.offset = 0;
444 io_parms.length = sizeof(struct win_dev);
442 if (S_ISCHR(mode)) { 445 if (S_ISCHR(mode)) {
443 memcpy(pdev->type, "IntxCHR", 8); 446 memcpy(pdev->type, "IntxCHR", 8);
444 pdev->major = 447 pdev->major =
445 cpu_to_le64(MAJOR(device_number)); 448 cpu_to_le64(MAJOR(device_number));
446 pdev->minor = 449 pdev->minor =
447 cpu_to_le64(MINOR(device_number)); 450 cpu_to_le64(MINOR(device_number));
448 rc = CIFSSMBWrite(xid, pTcon, 451 rc = CIFSSMBWrite(xid, &io_parms,
449 fileHandle, 452 &bytes_written, (char *)pdev,
450 sizeof(struct win_dev),
451 0, &bytes_written, (char *)pdev,
452 NULL, 0); 453 NULL, 0);
453 } else if (S_ISBLK(mode)) { 454 } else if (S_ISBLK(mode)) {
454 memcpy(pdev->type, "IntxBLK", 8); 455 memcpy(pdev->type, "IntxBLK", 8);
@@ -456,10 +457,8 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
456 cpu_to_le64(MAJOR(device_number)); 457 cpu_to_le64(MAJOR(device_number));
457 pdev->minor = 458 pdev->minor =
458 cpu_to_le64(MINOR(device_number)); 459 cpu_to_le64(MINOR(device_number));
459 rc = CIFSSMBWrite(xid, pTcon, 460 rc = CIFSSMBWrite(xid, &io_parms,
460 fileHandle, 461 &bytes_written, (char *)pdev,
461 sizeof(struct win_dev),
462 0, &bytes_written, (char *)pdev,
463 NULL, 0); 462 NULL, 0);
464 } /* else if (S_ISFIFO) */ 463 } /* else if (S_ISFIFO) */
465 CIFSSMBClose(xid, pTcon, fileHandle); 464 CIFSSMBClose(xid, pTcon, fileHandle);
@@ -486,7 +485,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
486 bool posix_open = false; 485 bool posix_open = false;
487 struct cifs_sb_info *cifs_sb; 486 struct cifs_sb_info *cifs_sb;
488 struct tcon_link *tlink; 487 struct tcon_link *tlink;
489 struct cifsTconInfo *pTcon; 488 struct cifs_tcon *pTcon;
490 struct cifsFileInfo *cfile; 489 struct cifsFileInfo *cfile;
491 struct inode *newInode = NULL; 490 struct inode *newInode = NULL;
492 char *full_path = NULL; 491 char *full_path = NULL;
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 993f82045bf6..55d87ac52000 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -45,7 +45,7 @@
45#include "cifs_debug.h" 45#include "cifs_debug.h"
46#include "cifsfs.h" 46#include "cifsfs.h"
47 47
48#ifdef CONFIG_CIFS_EXPERIMENTAL 48#ifdef CIFS_NFSD_EXPORT
49static struct dentry *cifs_get_parent(struct dentry *dentry) 49static struct dentry *cifs_get_parent(struct dentry *dentry)
50{ 50{
51 /* BB need to add code here eventually to enable export via NFSD */ 51 /* BB need to add code here eventually to enable export via NFSD */
@@ -63,5 +63,5 @@ const struct export_operations cifs_export_ops = {
63 .encode_fs = */ 63 .encode_fs = */
64}; 64};
65 65
66#endif /* EXPERIMENTAL */ 66#endif /* CIFS_NFSD_EXPORT */
67 67
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index faf59529e847..bb71471a4d9d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -114,7 +114,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
114 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 114 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
115 struct cifs_fattr fattr; 115 struct cifs_fattr fattr;
116 struct tcon_link *tlink; 116 struct tcon_link *tlink;
117 struct cifsTconInfo *tcon; 117 struct cifs_tcon *tcon;
118 118
119 cFYI(1, "posix open %s", full_path); 119 cFYI(1, "posix open %s", full_path);
120 120
@@ -168,7 +168,7 @@ posix_open_ret:
168 168
169static int 169static int
170cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, 170cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
171 struct cifsTconInfo *tcon, unsigned int f_flags, __u32 *poplock, 171 struct cifs_tcon *tcon, unsigned int f_flags, __u32 *poplock,
172 __u16 *pnetfid, int xid) 172 __u16 *pnetfid, int xid)
173{ 173{
174 int rc; 174 int rc;
@@ -285,7 +285,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
285void cifsFileInfo_put(struct cifsFileInfo *cifs_file) 285void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
286{ 286{
287 struct inode *inode = cifs_file->dentry->d_inode; 287 struct inode *inode = cifs_file->dentry->d_inode;
288 struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink); 288 struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);
289 struct cifsInodeInfo *cifsi = CIFS_I(inode); 289 struct cifsInodeInfo *cifsi = CIFS_I(inode);
290 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 290 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
291 struct cifsLockInfo *li, *tmp; 291 struct cifsLockInfo *li, *tmp;
@@ -343,7 +343,7 @@ int cifs_open(struct inode *inode, struct file *file)
343 int xid; 343 int xid;
344 __u32 oplock; 344 __u32 oplock;
345 struct cifs_sb_info *cifs_sb; 345 struct cifs_sb_info *cifs_sb;
346 struct cifsTconInfo *tcon; 346 struct cifs_tcon *tcon;
347 struct tcon_link *tlink; 347 struct tcon_link *tlink;
348 struct cifsFileInfo *pCifsFile = NULL; 348 struct cifsFileInfo *pCifsFile = NULL;
349 char *full_path = NULL; 349 char *full_path = NULL;
@@ -457,7 +457,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
457 int xid; 457 int xid;
458 __u32 oplock; 458 __u32 oplock;
459 struct cifs_sb_info *cifs_sb; 459 struct cifs_sb_info *cifs_sb;
460 struct cifsTconInfo *tcon; 460 struct cifs_tcon *tcon;
461 struct cifsInodeInfo *pCifsInode; 461 struct cifsInodeInfo *pCifsInode;
462 struct inode *inode; 462 struct inode *inode;
463 char *full_path = NULL; 463 char *full_path = NULL;
@@ -596,7 +596,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
         xid = GetXid();
 
         if (pCFileStruct) {
-                struct cifsTconInfo *pTcon = tlink_tcon(pCFileStruct->tlink);
+                struct cifs_tcon *pTcon = tlink_tcon(pCFileStruct->tlink);
 
                 cFYI(1, "Freeing private data in close dir");
                 spin_lock(&cifs_file_list_lock);
@@ -653,7 +653,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
         __u64 length;
         bool wait_flag = false;
         struct cifs_sb_info *cifs_sb;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
         __u16 netfid;
         __u8 lockType = LOCKING_ANDX_LARGE_FILES;
         bool posix_locking = 0;
@@ -725,8 +725,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
                 else
                         posix_lock_type = CIFS_WRLCK;
                 rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */,
-                                length, pfLock,
-                                posix_lock_type, wait_flag);
+                                length, pfLock, posix_lock_type,
+                                wait_flag);
                 FreeXid(xid);
                 return rc;
         }
@@ -797,8 +797,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
                         posix_lock_type = CIFS_UNLCK;
 
                 rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */,
-                                      length, pfLock,
-                                      posix_lock_type, wait_flag);
+                                      length, pfLock, posix_lock_type,
+                                      wait_flag);
         } else {
                 struct cifsFileInfo *fid = file->private_data;
 
@@ -857,96 +857,7 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
         cifsi->server_eof = end_of_write;
 }
 
-ssize_t cifs_user_write(struct file *file, const char __user *write_data,
-        size_t write_size, loff_t *poffset)
-{
-        struct inode *inode = file->f_path.dentry->d_inode;
-        int rc = 0;
-        unsigned int bytes_written = 0;
-        unsigned int total_written;
-        struct cifs_sb_info *cifs_sb;
-        struct cifsTconInfo *pTcon;
-        int xid;
-        struct cifsFileInfo *open_file;
-        struct cifsInodeInfo *cifsi = CIFS_I(inode);
-
-        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-
-        /* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
-           *poffset, file->f_path.dentry->d_name.name); */
-
-        if (file->private_data == NULL)
-                return -EBADF;
-
-        open_file = file->private_data;
-        pTcon = tlink_tcon(open_file->tlink);
-
-        rc = generic_write_checks(file, poffset, &write_size, 0);
-        if (rc)
-                return rc;
-
-        xid = GetXid();
-
-        for (total_written = 0; write_size > total_written;
-             total_written += bytes_written) {
-                rc = -EAGAIN;
-                while (rc == -EAGAIN) {
-                        if (file->private_data == NULL) {
-                                /* file has been closed on us */
-                                FreeXid(xid);
-                        /* if we have gotten here we have written some data
-                           and blocked, and the file has been freed on us while
-                           we blocked so return what we managed to write */
-                                return total_written;
-                        }
-                        if (open_file->invalidHandle) {
-                                /* we could deadlock if we called
-                                   filemap_fdatawait from here so tell
-                                   reopen_file not to flush data to server
-                                   now */
-                                rc = cifs_reopen_file(open_file, false);
-                                if (rc != 0)
-                                        break;
-                        }
-
-                        rc = CIFSSMBWrite(xid, pTcon,
-                                open_file->netfid,
-                                min_t(const int, cifs_sb->wsize,
-                                      write_size - total_written),
-                                *poffset, &bytes_written,
-                                NULL, write_data + total_written, 0);
-                }
-                if (rc || (bytes_written == 0)) {
-                        if (total_written)
-                                break;
-                        else {
-                                FreeXid(xid);
-                                return rc;
-                        }
-                } else {
-                        cifs_update_eof(cifsi, *poffset, bytes_written);
-                        *poffset += bytes_written;
-                }
-        }
-
-        cifs_stats_bytes_written(pTcon, total_written);
-
-/* Do not update local mtime - server will set its actual value on write
- * inode->i_ctime = inode->i_mtime =
- *              current_fs_time(inode->i_sb);*/
-        if (total_written > 0) {
-                spin_lock(&inode->i_lock);
-                if (*poffset > inode->i_size)
-                        i_size_write(inode, *poffset);
-                spin_unlock(&inode->i_lock);
-        }
-        mark_inode_dirty_sync(inode);
-
-        FreeXid(xid);
-        return total_written;
-}
-
-static ssize_t cifs_write(struct cifsFileInfo *open_file,
+static ssize_t cifs_write(struct cifsFileInfo *open_file, __u32 pid,
                           const char *write_data, size_t write_size,
                           loff_t *poffset)
 {
@@ -954,10 +865,11 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
         unsigned int bytes_written = 0;
         unsigned int total_written;
         struct cifs_sb_info *cifs_sb;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
         int xid;
         struct dentry *dentry = open_file->dentry;
         struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
+        struct cifs_io_parms io_parms;
 
         cifs_sb = CIFS_SB(dentry->d_sb);
 
@@ -990,8 +902,13 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
                         /* iov[0] is reserved for smb header */
                         iov[1].iov_base = (char *)write_data + total_written;
                         iov[1].iov_len = len;
-                        rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid, len,
-                                           *poffset, &bytes_written, iov, 1, 0);
+                        io_parms.netfid = open_file->netfid;
+                        io_parms.pid = pid;
+                        io_parms.tcon = pTcon;
+                        io_parms.offset = *poffset;
+                        io_parms.length = len;
+                        rc = CIFSSMBWrite2(xid, &io_parms, &bytes_written, iov,
+                                           1, 0);
                 }
                 if (rc || (bytes_written == 0)) {
                         if (total_written)
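The hunk above replaces CIFSSMBWrite2's long positional argument list with a cifs_io_parms bundle, which also carries the pid to be stamped on the SMB request. A minimal sketch of the new calling convention, using only the field names visible in this diff:

/* Sketch: gather the parameters once, then pass one pointer. */
struct cifs_io_parms io_parms = {
        .netfid = open_file->netfid,    /* handle from the open */
        .pid    = pid,                  /* pid sent on the wire */
        .tcon   = pTcon,                /* tree connection */
        .offset = *poffset,
        .length = len,
};
rc = CIFSSMBWrite2(xid, &io_parms, &bytes_written, iov, 1, 0);
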
@@ -1160,8 +1077,8 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 
         open_file = find_writable_file(CIFS_I(mapping->host), false);
         if (open_file) {
-                bytes_written = cifs_write(open_file, write_data,
-                                           to - from, &offset);
+                bytes_written = cifs_write(open_file, open_file->pid,
+                                           write_data, to - from, &offset);
                 cifsFileInfo_put(open_file);
                 /* Does mm or vfs already set times? */
                 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
@@ -1181,58 +1098,20 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 static int cifs_writepages(struct address_space *mapping,
                            struct writeback_control *wbc)
 {
-        unsigned int bytes_to_write;
-        unsigned int bytes_written;
-        struct cifs_sb_info *cifs_sb;
-        int done = 0;
-        pgoff_t end;
-        pgoff_t index;
-        int range_whole = 0;
-        struct kvec *iov;
-        int len;
-        int n_iov = 0;
-        pgoff_t next;
-        int nr_pages;
-        __u64 offset = 0;
-        struct cifsFileInfo *open_file;
-        struct cifsTconInfo *tcon;
-        struct cifsInodeInfo *cifsi = CIFS_I(mapping->host);
+        struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb);
+        bool done = false, scanned = false, range_whole = false;
+        pgoff_t end, index;
+        struct cifs_writedata *wdata;
         struct page *page;
-        struct pagevec pvec;
         int rc = 0;
-        int scanned = 0;
-        int xid;
-
-        cifs_sb = CIFS_SB(mapping->host->i_sb);
 
         /*
-         * If wsize is smaller that the page cache size, default to writing
+         * If wsize is smaller than the page cache size, default to writing
          * one page at a time via cifs_writepage
          */
         if (cifs_sb->wsize < PAGE_CACHE_SIZE)
                 return generic_writepages(mapping, wbc);
 
-        iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL);
-        if (iov == NULL)
-                return generic_writepages(mapping, wbc);
-
-        /*
-         * if there's no open file, then this is likely to fail too,
-         * but it'll at least handle the return. Maybe it should be
-         * a BUG() instead?
-         */
-        open_file = find_writable_file(CIFS_I(mapping->host), false);
-        if (!open_file) {
-                kfree(iov);
-                return generic_writepages(mapping, wbc);
-        }
-
-        tcon = tlink_tcon(open_file->tlink);
-        cifsFileInfo_put(open_file);
-
-        xid = GetXid();
-
-        pagevec_init(&pvec, 0);
         if (wbc->range_cyclic) {
                 index = mapping->writeback_index; /* Start from prev offset */
                 end = -1;
@@ -1240,24 +1119,49 @@ static int cifs_writepages(struct address_space *mapping,
                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
                 end = wbc->range_end >> PAGE_CACHE_SHIFT;
                 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
-                        range_whole = 1;
-                scanned = 1;
+                        range_whole = true;
+                scanned = true;
         }
 retry:
-        while (!done && (index <= end) &&
-               (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-                        PAGECACHE_TAG_DIRTY,
-                        min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1))) {
-                int first;
-                unsigned int i;
-
-                first = -1;
-                next = 0;
-                n_iov = 0;
-                bytes_to_write = 0;
-
-                for (i = 0; i < nr_pages; i++) {
-                        page = pvec.pages[i];
+        while (!done && index <= end) {
+                unsigned int i, nr_pages, found_pages;
+                pgoff_t next = 0, tofind;
+                struct page **pages;
+
+                tofind = min((cifs_sb->wsize / PAGE_CACHE_SIZE) - 1,
+                                end - index) + 1;
+
+                wdata = cifs_writedata_alloc((unsigned int)tofind);
+                if (!wdata) {
+                        rc = -ENOMEM;
+                        break;
+                }
+
+                /*
+                 * find_get_pages_tag seems to return a max of 256 on each
+                 * iteration, so we must call it several times in order to
+                 * fill the array or the wsize is effectively limited to
+                 * 256 * PAGE_CACHE_SIZE.
+                 */
+                found_pages = 0;
+                pages = wdata->pages;
+                do {
+                        nr_pages = find_get_pages_tag(mapping, &index,
+                                                        PAGECACHE_TAG_DIRTY,
+                                                        tofind, pages);
+                        found_pages += nr_pages;
+                        tofind -= nr_pages;
+                        pages += nr_pages;
+                } while (nr_pages && tofind && index <= end);
+
+                if (found_pages == 0) {
+                        kref_put(&wdata->refcount, cifs_writedata_release);
+                        break;
+                }
+
+                nr_pages = 0;
+                for (i = 0; i < found_pages; i++) {
+                        page = wdata->pages[i];
                         /*
                          * At this point we hold neither mapping->tree_lock nor
                          * lock on the page itself: the page may be truncated or
@@ -1266,7 +1170,7 @@ retry:
                          * mapping
                          */
 
-                        if (first < 0)
+                        if (nr_pages == 0)
                                 lock_page(page);
                         else if (!trylock_page(page))
                                 break;
@@ -1277,7 +1181,7 @@ retry:
                         }
 
                         if (!wbc->range_cyclic && page->index > end) {
-                                done = 1;
+                                done = true;
                                 unlock_page(page);
                                 break;
                         }
@@ -1304,125 +1208,96 @@ retry:
                         set_page_writeback(page);
 
                         if (page_offset(page) >= mapping->host->i_size) {
-                                done = 1;
+                                done = true;
                                 unlock_page(page);
                                 end_page_writeback(page);
                                 break;
                         }
 
-                        /*
-                         * BB can we get rid of this? pages are held by pvec
-                         */
-                        page_cache_get(page);
-
-                        len = min(mapping->host->i_size - page_offset(page),
-                                  (loff_t)PAGE_CACHE_SIZE);
-
-                        /* reserve iov[0] for the smb header */
-                        n_iov++;
-                        iov[n_iov].iov_base = kmap(page);
-                        iov[n_iov].iov_len = len;
-                        bytes_to_write += len;
-
-                        if (first < 0) {
-                                first = i;
-                                offset = page_offset(page);
-                        }
-                        next = page->index + 1;
-                        if (bytes_to_write + PAGE_CACHE_SIZE > cifs_sb->wsize)
-                                break;
-                }
-                if (n_iov) {
-retry_write:
-                        open_file = find_writable_file(CIFS_I(mapping->host),
-                                                       false);
-                        if (!open_file) {
-                                cERROR(1, "No writable handles for inode");
-                                rc = -EBADF;
-                        } else {
-                                rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
-                                                   bytes_to_write, offset,
-                                                   &bytes_written, iov, n_iov,
-                                                   0);
-                                cifsFileInfo_put(open_file);
-                        }
-
-                        cFYI(1, "Write2 rc=%d, wrote=%u", rc, bytes_written);
-
-                        /*
-                         * For now, treat a short write as if nothing got
-                         * written. A zero length write however indicates
-                         * ENOSPC or EFBIG. We have no way to know which
-                         * though, so call it ENOSPC for now. EFBIG would
-                         * get translated to AS_EIO anyway.
-                         *
-                         * FIXME: make it take into account the data that did
-                         * get written
-                         */
-                        if (rc == 0) {
-                                if (bytes_written == 0)
-                                        rc = -ENOSPC;
-                                else if (bytes_written < bytes_to_write)
-                                        rc = -EAGAIN;
-                        }
-
-                        /* retry on data-integrity flush */
-                        if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
-                                goto retry_write;
-
-                        /* fix the stats and EOF */
-                        if (bytes_written > 0) {
-                                cifs_stats_bytes_written(tcon, bytes_written);
-                                cifs_update_eof(cifsi, offset, bytes_written);
-                        }
-
-                        for (i = 0; i < n_iov; i++) {
-                                page = pvec.pages[first + i];
-                                /* on retryable write error, redirty page */
+                        wdata->pages[i] = page;
+                        next = page->index + 1;
+                        ++nr_pages;
+                }
+
+                /* reset index to refind any pages skipped */
+                if (nr_pages == 0)
+                        index = wdata->pages[0]->index + 1;
+
+                /* put any pages we aren't going to use */
+                for (i = nr_pages; i < found_pages; i++) {
+                        page_cache_release(wdata->pages[i]);
+                        wdata->pages[i] = NULL;
+                }
+
+                /* nothing to write? */
+                if (nr_pages == 0) {
+                        kref_put(&wdata->refcount, cifs_writedata_release);
+                        continue;
+                }
+
+                wdata->sync_mode = wbc->sync_mode;
+                wdata->nr_pages = nr_pages;
+                wdata->offset = page_offset(wdata->pages[0]);
+
+                do {
+                        if (wdata->cfile != NULL)
+                                cifsFileInfo_put(wdata->cfile);
+                        wdata->cfile = find_writable_file(CIFS_I(mapping->host),
+                                                          false);
+                        if (!wdata->cfile) {
+                                cERROR(1, "No writable handles for inode");
+                                rc = -EBADF;
+                                break;
+                        }
+                        rc = cifs_async_writev(wdata);
+                } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN);
+
+                for (i = 0; i < nr_pages; ++i)
+                        unlock_page(wdata->pages[i]);
+
+                /* send failure -- clean up the mess */
+                if (rc != 0) {
+                        for (i = 0; i < nr_pages; ++i) {
                                 if (rc == -EAGAIN)
-                                        redirty_page_for_writepage(wbc, page);
-                                else if (rc != 0)
-                                        SetPageError(page);
-                                kunmap(page);
-                                unlock_page(page);
-                                end_page_writeback(page);
-                                page_cache_release(page);
+                                        redirty_page_for_writepage(wbc,
+                                                           wdata->pages[i]);
+                                else
+                                        SetPageError(wdata->pages[i]);
+                                end_page_writeback(wdata->pages[i]);
+                                page_cache_release(wdata->pages[i]);
                         }
-
                         if (rc != -EAGAIN)
                                 mapping_set_error(mapping, rc);
-                        else
-                                rc = 0;
+                }
+                kref_put(&wdata->refcount, cifs_writedata_release);
 
-                        if ((wbc->nr_to_write -= n_iov) <= 0)
-                                done = 1;
-                        index = next;
-                } else
-                        /* Need to re-find the pages we skipped */
-                        index = pvec.pages[0]->index + 1;
+                wbc->nr_to_write -= nr_pages;
+                if (wbc->nr_to_write <= 0)
+                        done = true;
 
-                pagevec_release(&pvec);
+                index = next;
         }
 
         if (!scanned && !done) {
                 /*
                  * We hit the last page and there is more work to be done: wrap
                  * back to the start of the file
                  */
-                scanned = 1;
+                scanned = true;
                 index = 0;
                 goto retry;
         }
 
         if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                 mapping->writeback_index = index;
 
-        FreeXid(xid);
-        kfree(iov);
         return rc;
 }
 
-static int cifs_writepage(struct page *page, struct writeback_control *wbc)
+static int
+cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
 {
-        int rc = -EFAULT;
+        int rc;
         int xid;
 
         xid = GetXid();
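cifs_writepages no longer builds a kvec array and blocks in CIFSSMBWrite2; it fills a refcounted cifs_writedata and hands it to cifs_async_writev, retrying the submission only for data-integrity writeback. The refcount pairing is the part that is easy to get wrong; a sketch of the intended ownership, assuming the helpers named in this hunk:

/* Sketch: the allocator returns one reference; every exit path must
 * drop it, whether or not the async send was ever issued. */
struct cifs_writedata *wdata = cifs_writedata_alloc(tofind);
if (!wdata)
        return -ENOMEM;
/* ... fill wdata->pages and submit via cifs_async_writev(wdata) ... */
kref_put(&wdata->refcount, cifs_writedata_release);
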
@@ -1442,21 +1317,43 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
          * to fail to update with the state of the page correctly.
          */
         set_page_writeback(page);
+retry_write:
         rc = cifs_partialpagewrite(page, 0, PAGE_CACHE_SIZE);
-        SetPageUptodate(page); /* BB add check for error and Clearuptodate? */
-        unlock_page(page);
+        if (rc == -EAGAIN && wbc->sync_mode == WB_SYNC_ALL)
+                goto retry_write;
+        else if (rc == -EAGAIN)
+                redirty_page_for_writepage(wbc, page);
+        else if (rc != 0)
+                SetPageError(page);
+        else
+                SetPageUptodate(page);
         end_page_writeback(page);
         page_cache_release(page);
         FreeXid(xid);
         return rc;
 }
 
+static int cifs_writepage(struct page *page, struct writeback_control *wbc)
+{
+        int rc = cifs_writepage_locked(page, wbc);
+        unlock_page(page);
+        return rc;
+}
+
 static int cifs_write_end(struct file *file, struct address_space *mapping,
                           loff_t pos, unsigned len, unsigned copied,
                           struct page *page, void *fsdata)
 {
         int rc;
         struct inode *inode = mapping->host;
+        struct cifsFileInfo *cfile = file->private_data;
+        struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+        __u32 pid;
+
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+                pid = cfile->pid;
+        else
+                pid = current->tgid;
 
         cFYI(1, "write_end for page %p from pos %lld with %d bytes",
              page, pos, copied);
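Splitting out cifs_writepage_locked lets a caller that already holds the page lock (such as cifs_launder_page, added later in this file) reuse the write-and-retry logic, while cifs_writepage keeps the usual ->writepage contract of unlocking the page itself. A sketch of the locked variant's contract:

/* Sketch: the *_locked helper leaves the page locked for the caller. */
lock_page(page);
if (clear_page_dirty_for_io(page))
        rc = cifs_writepage_locked(page, &wbc); /* page still locked */
unlock_page(page);
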
@@ -1480,8 +1377,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
                 /* BB check if anything else missing out of ppw
                    such as updating last write time */
                 page_data = kmap(page);
-                rc = cifs_write(file->private_data, page_data + offset,
-                                copied, &pos);
+                rc = cifs_write(cfile, pid, page_data + offset, copied, &pos);
                 /* if (rc < 0) should we set writebehind rc? */
                 kunmap(page);
 
@@ -1509,7 +1405,7 @@ int cifs_strict_fsync(struct file *file, int datasync)
 {
         int xid;
         int rc = 0;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
         struct cifsFileInfo *smbfile = file->private_data;
         struct inode *inode = file->f_path.dentry->d_inode;
         struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -1519,8 +1415,13 @@ int cifs_strict_fsync(struct file *file, int datasync)
         cFYI(1, "Sync file - name: %s datasync: 0x%x",
                 file->f_path.dentry->d_name.name, datasync);
 
-        if (!CIFS_I(inode)->clientCanCacheRead)
-                cifs_invalidate_mapping(inode);
+        if (!CIFS_I(inode)->clientCanCacheRead) {
+                rc = cifs_invalidate_mapping(inode);
+                if (rc) {
+                        cFYI(1, "rc: %d during invalidate phase", rc);
+                        rc = 0; /* don't care about it in fsync */
+                }
+        }
 
         tcon = tlink_tcon(smbfile->tlink);
         if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
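cifs_invalidate_mapping can now fail, because invalidate_inode_pages2 refuses to drop pages it cannot clean; fsync deliberately logs and ignores that failure, since the flush that follows is what fsync actually guarantees. The same policy, reduced to its core:

/* Sketch: a stale pagecache is not an fsync failure. */
if (!CIFS_I(inode)->clientCanCacheRead) {
        rc = cifs_invalidate_mapping(inode);
        if (rc)
                rc = 0; /* best effort; the SMB flush still runs */
}
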
@@ -1534,7 +1435,7 @@ int cifs_fsync(struct file *file, int datasync)
 {
         int xid;
         int rc = 0;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
         struct cifsFileInfo *smbfile = file->private_data;
         struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 
@@ -1625,9 +1526,11 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
         struct iov_iter it;
         struct inode *inode;
         struct cifsFileInfo *open_file;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
         struct cifs_sb_info *cifs_sb;
+        struct cifs_io_parms io_parms;
         int xid, rc;
+        __u32 pid;
 
         len = iov_length(iov, nr_segs);
         if (!len)
@@ -1659,6 +1562,12 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
 
         xid = GetXid();
         open_file = file->private_data;
+
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+                pid = open_file->pid;
+        else
+                pid = current->tgid;
+
         pTcon = tlink_tcon(open_file->tlink);
         inode = file->f_path.dentry->d_inode;
 
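This pid selection recurs throughout the read and write paths in this patch: when the superblock carries CIFS_MOUNT_RWPIDFORWARD (presumably set by an rwpidforward-style mount option), I/O is issued with the pid recorded at open time rather than the caller's, which keeps server-side byte-range lock ownership consistent across threads sharing one open file:

/* Sketch: choose the pid that accompanies each SMB read/write. */
__u32 pid;

if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
        pid = open_file->pid;   /* pid captured when the file was opened */
else
        pid = current->tgid;    /* default: the calling process */
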
@@ -1685,9 +1594,13 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
                         if (rc != 0)
                                 break;
                 }
-                rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid,
-                                   cur_len, *poffset, &written,
-                                   to_send, npages, 0);
+                io_parms.netfid = open_file->netfid;
+                io_parms.pid = pid;
+                io_parms.tcon = pTcon;
+                io_parms.offset = *poffset;
+                io_parms.length = cur_len;
+                rc = CIFSSMBWrite2(xid, &io_parms, &written, to_send,
+                                   npages, 0);
         } while (rc == -EAGAIN);
 
         for (i = 0; i < npages; i++)
@@ -1726,7 +1639,7 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
         return total_written;
 }
 
-static ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
+ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
                                 unsigned long nr_segs, loff_t pos)
 {
         ssize_t written;
@@ -1780,10 +1693,12 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
         size_t len, cur_len;
         int iov_offset = 0;
         struct cifs_sb_info *cifs_sb;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
         struct cifsFileInfo *open_file;
         struct smb_com_read_rsp *pSMBr;
+        struct cifs_io_parms io_parms;
         char *read_data;
+        __u32 pid;
 
         if (!nr_segs)
                 return 0;
@@ -1798,6 +1713,11 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
         open_file = file->private_data;
         pTcon = tlink_tcon(open_file->tlink);
 
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+                pid = open_file->pid;
+        else
+                pid = current->tgid;
+
         if ((file->f_flags & O_ACCMODE) == O_WRONLY)
                 cFYI(1, "attempting read on write only file instance");
 
@@ -1813,8 +1733,12 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
                         if (rc != 0)
                                 break;
                 }
-                rc = CIFSSMBRead(xid, pTcon, open_file->netfid,
-                                 cur_len, *poffset, &bytes_read,
+                io_parms.netfid = open_file->netfid;
+                io_parms.pid = pid;
+                io_parms.tcon = pTcon;
+                io_parms.offset = *poffset;
+                io_parms.length = len;
+                rc = CIFSSMBRead(xid, &io_parms, &bytes_read,
                                  &read_data, &buf_type);
                 pSMBr = (struct smb_com_read_rsp *)read_data;
                 if (read_data) {
1851 1775
1852ssize_t cifs_user_read(struct file *file, char __user *read_data, 1776ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
1853 size_t read_size, loff_t *poffset)
1854{
1855 struct iovec iov;
1856 iov.iov_base = read_data;
1857 iov.iov_len = read_size;
1858
1859 return cifs_iovec_read(file, &iov, 1, poffset);
1860}
1861
1862static ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
1863 unsigned long nr_segs, loff_t pos) 1777 unsigned long nr_segs, loff_t pos)
1864{ 1778{
1865 ssize_t read; 1779 ssize_t read;
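With cifs_user_write and cifs_user_read removed, and cifs_user_writev/cifs_user_readv un-static'd in this patch, the buffer-based entry points give way to the iovec-based aio paths. A hedged sketch of the expected wiring; the real file_operations table lives in cifsfs.c, outside this diff, so the exact fields are an assumption:

/* Sketch only: illustrative file_operations wiring. */
const struct file_operations cifs_file_ops_sketch = {
        .aio_read  = cifs_user_readv,   /* exported by this patch */
        .aio_write = cifs_user_writev,  /* exported by this patch */
        /* remaining ops unchanged */
};
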
@@ -1901,11 +1815,13 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
         unsigned int total_read;
         unsigned int current_read_size;
         struct cifs_sb_info *cifs_sb;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
         int xid;
         char *current_offset;
         struct cifsFileInfo *open_file;
+        struct cifs_io_parms io_parms;
         int buf_type = CIFS_NO_BUFFER;
+        __u32 pid;
 
         xid = GetXid();
         cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
@@ -1918,6 +1834,11 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
         open_file = file->private_data;
         pTcon = tlink_tcon(open_file->tlink);
 
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+                pid = open_file->pid;
+        else
+                pid = current->tgid;
+
         if ((file->f_flags & O_ACCMODE) == O_WRONLY)
                 cFYI(1, "attempting read on write only file instance");
 
@@ -1940,11 +1861,13 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
                         if (rc != 0)
                                 break;
                 }
-                rc = CIFSSMBRead(xid, pTcon,
-                                 open_file->netfid,
-                                 current_read_size, *poffset,
-                                 &bytes_read, &current_offset,
-                                 &buf_type);
+                io_parms.netfid = open_file->netfid;
+                io_parms.pid = pid;
+                io_parms.tcon = pTcon;
+                io_parms.offset = *poffset;
+                io_parms.length = current_read_size;
+                rc = CIFSSMBRead(xid, &io_parms, &bytes_read,
+                                 &current_offset, &buf_type);
         }
         if (rc || (bytes_read == 0)) {
                 if (total_read) {
@@ -1987,8 +1910,11 @@ int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
 
         xid = GetXid();
 
-        if (!CIFS_I(inode)->clientCanCacheRead)
-                cifs_invalidate_mapping(inode);
+        if (!CIFS_I(inode)->clientCanCacheRead) {
+                rc = cifs_invalidate_mapping(inode);
+                if (rc)
+                        return rc;
+        }
 
         rc = generic_file_mmap(file, vma);
         if (rc == 0)
@@ -2072,13 +1998,15 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
         loff_t offset;
         struct page *page;
         struct cifs_sb_info *cifs_sb;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
         unsigned int bytes_read = 0;
         unsigned int read_size, i;
         char *smb_read_data = NULL;
         struct smb_com_read_rsp *pSMBr;
         struct cifsFileInfo *open_file;
+        struct cifs_io_parms io_parms;
         int buf_type = CIFS_NO_BUFFER;
+        __u32 pid;
 
         xid = GetXid();
         if (file->private_data == NULL) {
@@ -2100,6 +2028,11 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
                 goto read_complete;
 
         cFYI(DBG2, "rpages: num pages %d", num_pages);
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+                pid = open_file->pid;
+        else
+                pid = current->tgid;
+
         for (i = 0; i < num_pages; ) {
                 unsigned contig_pages;
                 struct page *tmp_page;
@@ -2141,12 +2074,13 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
                         if (rc != 0)
                                 break;
                 }
-
-                rc = CIFSSMBRead(xid, pTcon,
-                                 open_file->netfid,
-                                 read_size, offset,
-                                 &bytes_read, &smb_read_data,
-                                 &buf_type);
+                io_parms.netfid = open_file->netfid;
+                io_parms.pid = pid;
+                io_parms.tcon = pTcon;
+                io_parms.offset = offset;
+                io_parms.length = read_size;
+                rc = CIFSSMBRead(xid, &io_parms, &bytes_read,
+                                 &smb_read_data, &buf_type);
                 /* BB more RC checks ? */
                 if (rc == -EAGAIN) {
                         if (smb_read_data) {
@@ -2415,6 +2349,27 @@ static void cifs_invalidate_page(struct page *page, unsigned long offset)
         cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
 }
 
+static int cifs_launder_page(struct page *page)
+{
+        int rc = 0;
+        loff_t range_start = page_offset(page);
+        loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
+        struct writeback_control wbc = {
+                .sync_mode = WB_SYNC_ALL,
+                .nr_to_write = 0,
+                .range_start = range_start,
+                .range_end = range_end,
+        };
+
+        cFYI(1, "Launder page: %p", page);
+
+        if (clear_page_dirty_for_io(page))
+                rc = cifs_writepage_locked(page, &wbc);
+
+        cifs_fscache_invalidate_page(page, page->mapping->host);
+        return rc;
+}
+
 void cifs_oplock_break(struct work_struct *work)
 {
         struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
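The new cifs_launder_page gives invalidate_inode_pages2 a synchronous way to clean a dirty page before dropping it: the page arrives locked, so cifs_writepage_locked can be called directly, and nr_to_write can stay 0 because the wbc names this one explicit page rather than asking the writeback scan to find work:

/* Sketch: a single-page, data-integrity writeback control. */
struct writeback_control wbc = {
        .sync_mode   = WB_SYNC_ALL,     /* must reach the server */
        .nr_to_write = 0,               /* page is passed explicitly */
        .range_start = page_offset(page),
        .range_end   = page_offset(page) + PAGE_CACHE_SIZE - 1,
};
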
@@ -2486,7 +2441,7 @@ const struct address_space_operations cifs_addr_ops = {
         .set_page_dirty = __set_page_dirty_nobuffers,
         .releasepage = cifs_release_page,
         .invalidatepage = cifs_invalidate_page,
-        /* .direct_IO = */
+        .launder_page = cifs_launder_page,
 };
 
 /*
@@ -2503,5 +2458,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
         .set_page_dirty = __set_page_dirty_nobuffers,
         .releasepage = cifs_release_page,
         .invalidatepage = cifs_invalidate_page,
-        /* .direct_IO = */
+        .launder_page = cifs_launder_page,
 };
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 297a43d0ff7f..d368a47ba5eb 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -40,7 +40,7 @@ void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server)
         server->fscache = NULL;
 }
 
-void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon)
+void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
 {
         struct TCP_Server_Info *server = tcon->ses->server;
 
@@ -51,7 +51,7 @@ void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon)
                 server->fscache, tcon->fscache);
 }
 
-void cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon)
+void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon)
 {
         cFYI(1, "CIFS: releasing superblock cookie (0x%p)", tcon->fscache);
         fscache_relinquish_cookie(tcon->fscache, 0);
@@ -62,7 +62,7 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
 {
         struct cifsInodeInfo *cifsi = CIFS_I(inode);
         struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+        struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
 
         if (cifsi->fscache)
                 return;
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index 31b88ec2341e..63539323e0b9 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -40,8 +40,8 @@ extern void cifs_fscache_unregister(void);
  */
 extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *);
 extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *);
-extern void cifs_fscache_get_super_cookie(struct cifsTconInfo *);
-extern void cifs_fscache_release_super_cookie(struct cifsTconInfo *);
+extern void cifs_fscache_get_super_cookie(struct cifs_tcon *);
+extern void cifs_fscache_release_super_cookie(struct cifs_tcon *);
 
 extern void cifs_fscache_release_inode_cookie(struct inode *);
 extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *);
@@ -99,9 +99,9 @@ static inline void
 cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {}
 static inline void
 cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {}
-static inline void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon) {}
+static inline void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) {}
 static inline void
-cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon) {}
+cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {}
 
 static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {}
 static inline void cifs_fscache_set_inode_cookie(struct inode *inode,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8852470b4fbb..9b018c8334fa 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -295,7 +295,7 @@ int cifs_get_file_info_unix(struct file *filp)
295 struct inode *inode = filp->f_path.dentry->d_inode; 295 struct inode *inode = filp->f_path.dentry->d_inode;
296 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 296 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
297 struct cifsFileInfo *cfile = filp->private_data; 297 struct cifsFileInfo *cfile = filp->private_data;
298 struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink); 298 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
299 299
300 xid = GetXid(); 300 xid = GetXid();
301 rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data); 301 rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -318,7 +318,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
318 int rc; 318 int rc;
319 FILE_UNIX_BASIC_INFO find_data; 319 FILE_UNIX_BASIC_INFO find_data;
320 struct cifs_fattr fattr; 320 struct cifs_fattr fattr;
321 struct cifsTconInfo *tcon; 321 struct cifs_tcon *tcon;
322 struct tcon_link *tlink; 322 struct tcon_link *tlink;
323 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 323 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
324 324
@@ -373,7 +373,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
373 int oplock = 0; 373 int oplock = 0;
374 __u16 netfid; 374 __u16 netfid;
375 struct tcon_link *tlink; 375 struct tcon_link *tlink;
376 struct cifsTconInfo *tcon; 376 struct cifs_tcon *tcon;
377 struct cifs_io_parms io_parms;
377 char buf[24]; 378 char buf[24];
378 unsigned int bytes_read; 379 unsigned int bytes_read;
379 char *pbuf; 380 char *pbuf;
@@ -405,9 +406,13 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
         if (rc == 0) {
                 int buf_type = CIFS_NO_BUFFER;
                         /* Read header */
-                rc = CIFSSMBRead(xid, tcon, netfid,
-                                 24 /* length */, 0 /* offset */,
-                                 &bytes_read, &pbuf, &buf_type);
+                io_parms.netfid = netfid;
+                io_parms.pid = current->tgid;
+                io_parms.tcon = tcon;
+                io_parms.offset = 0;
+                io_parms.length = 24;
+                rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf,
+                                 &buf_type);
                 if ((rc == 0) && (bytes_read >= 8)) {
                         if (memcmp("IntxBLK", pbuf, 8) == 0) {
                                 cFYI(1, "Block device");
@@ -468,7 +473,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
         char ea_value[4];
         __u32 mode;
         struct tcon_link *tlink;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
 
         tlink = cifs_sb_tlink(cifs_sb);
         if (IS_ERR(tlink))
@@ -502,7 +507,7 @@ static void
 cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
                        struct cifs_sb_info *cifs_sb, bool adjust_tz)
 {
-        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+        struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
 
         memset(fattr, 0, sizeof(*fattr));
         fattr->cf_cifsattrs = le32_to_cpu(info->Attributes);
@@ -553,7 +558,7 @@ int cifs_get_file_info(struct file *filp)
         struct inode *inode = filp->f_path.dentry->d_inode;
         struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
         struct cifsFileInfo *cfile = filp->private_data;
-        struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
+        struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
 
         xid = GetXid();
         rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -590,7 +595,7 @@ int cifs_get_inode_info(struct inode **pinode,
                     struct super_block *sb, int xid, const __u16 *pfid)
 {
         int rc = 0, tmprc;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
         struct tcon_link *tlink;
         struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
         char *buf = NULL;
@@ -735,10 +740,10 @@ static const struct inode_operations cifs_ipc_inode_ops = {
         .lookup = cifs_lookup,
 };
 
-char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
-                              struct cifsTconInfo *tcon)
+char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
+                              struct cifs_tcon *tcon)
 {
-        int pplen = cifs_sb->prepathlen;
+        int pplen = vol->prepath ? strlen(vol->prepath) : 0;
         int dfsplen;
         char *full_path = NULL;
 
@@ -772,7 +777,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
                         }
                 }
         }
-        strncpy(full_path + dfsplen, cifs_sb->prepath, pplen);
+        strncpy(full_path + dfsplen, vol->prepath, pplen);
         full_path[dfsplen + pplen] = 0; /* add trailing null */
         return full_path;
 }
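cifs_build_path_to_root now reads the prefix path straight from the parsed mount options (smb_vol) instead of a copy cached in cifs_sb, which is what lets the superblock info stop carrying prepath/prepathlen. A hedged sketch of the adjusted call site, where volume_info is an illustrative name for the parsed options:

/* Sketch: the caller now supplies the parsed mount options, too. */
char *full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon);
if (full_path == NULL)
        return -ENOMEM;
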
@@ -878,25 +883,19 @@ retry_iget5_locked:
 }
 
 /* gets root inode */
-struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
+struct inode *cifs_root_iget(struct super_block *sb)
 {
         int xid;
         struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
         struct inode *inode = NULL;
         long rc;
-        char *full_path;
-        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
-
-        full_path = cifs_build_path_to_root(cifs_sb, tcon);
-        if (full_path == NULL)
-                return ERR_PTR(-ENOMEM);
+        struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
 
         xid = GetXid();
         if (tcon->unix_ext)
-                rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
+                rc = cifs_get_inode_info_unix(&inode, "", sb, xid);
         else
-                rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
-                                         xid, NULL);
+                rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL);
 
         if (!inode) {
                 inode = ERR_PTR(rc);
@@ -922,7 +921,6 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
         }
 
 out:
-        kfree(full_path);
         /* can not call macro FreeXid here since in a void func
          * TODO: This is no longer true
          */
@@ -943,7 +941,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
         struct cifsInodeInfo *cifsInode = CIFS_I(inode);
         struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
         struct tcon_link *tlink = NULL;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
         FILE_BASIC_INFO info_buf;
 
         if (attrs == NULL)
@@ -1061,7 +1059,7 @@ cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
         struct cifsInodeInfo *cifsInode = CIFS_I(inode);
         struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
         struct tcon_link *tlink;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
         __u32 dosattr, origattr;
         FILE_BASIC_INFO *info_buf = NULL;
 
@@ -1179,7 +1177,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
         struct super_block *sb = dir->i_sb;
         struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
         struct tcon_link *tlink;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
         struct iattr *attrs = NULL;
         __u32 dosattr = 0, origattr = 0;
 
@@ -1277,7 +1275,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
         int xid;
         struct cifs_sb_info *cifs_sb;
         struct tcon_link *tlink;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
         char *full_path = NULL;
         struct inode *newinode = NULL;
         struct cifs_fattr fattr;
@@ -1455,7 +1453,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
         int xid;
         struct cifs_sb_info *cifs_sb;
         struct tcon_link *tlink;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
         char *full_path = NULL;
         struct cifsInodeInfo *cifsInode;
 
@@ -1512,7 +1510,7 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
 {
         struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
         struct tcon_link *tlink;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
         __u16 srcfid;
         int oplock, rc;
 
@@ -1564,7 +1562,7 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
         char *toName = NULL;
         struct cifs_sb_info *cifs_sb;
         struct tcon_link *tlink;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
         FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
         FILE_UNIX_BASIC_INFO *info_buf_target;
         int xid, rc, tmprc;
@@ -1683,71 +1681,70 @@ cifs_inode_needs_reval(struct inode *inode)
 /*
  * Zap the cache. Called when invalid_mapping flag is set.
  */
-void
+int
 cifs_invalidate_mapping(struct inode *inode)
 {
-        int rc;
+        int rc = 0;
         struct cifsInodeInfo *cifs_i = CIFS_I(inode);
 
         cifs_i->invalid_mapping = false;
 
-        /* write back any cached data */
         if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
-                rc = filemap_write_and_wait(inode->i_mapping);
-                mapping_set_error(inode->i_mapping, rc);
+                rc = invalidate_inode_pages2(inode->i_mapping);
+                if (rc) {
+                        cERROR(1, "%s: could not invalidate inode %p", __func__,
+                               inode);
+                        cifs_i->invalid_mapping = true;
+                }
         }
-        invalidate_remote_inode(inode);
+
         cifs_fscache_reset_inode_cookie(inode);
+        return rc;
 }
 
-int cifs_revalidate_file(struct file *filp)
+int cifs_revalidate_file_attr(struct file *filp)
 {
         int rc = 0;
         struct inode *inode = filp->f_path.dentry->d_inode;
         struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
 
         if (!cifs_inode_needs_reval(inode))
-                goto check_inval;
+                return rc;
 
         if (tlink_tcon(cfile->tlink)->unix_ext)
                 rc = cifs_get_file_info_unix(filp);
         else
                 rc = cifs_get_file_info(filp);
 
-check_inval:
-        if (CIFS_I(inode)->invalid_mapping)
-                cifs_invalidate_mapping(inode);
-
         return rc;
 }
 
-/* revalidate a dentry's inode attributes */
-int cifs_revalidate_dentry(struct dentry *dentry)
+int cifs_revalidate_dentry_attr(struct dentry *dentry)
 {
         int xid;
         int rc = 0;
-        char *full_path = NULL;
         struct inode *inode = dentry->d_inode;
         struct super_block *sb = dentry->d_sb;
+        char *full_path = NULL;
 
         if (inode == NULL)
                 return -ENOENT;
 
-        xid = GetXid();
-
         if (!cifs_inode_needs_reval(inode))
-                goto check_inval;
+                return rc;
+
+        xid = GetXid();
 
         /* can not safely grab the rename sem here if rename calls revalidate
            since that would deadlock */
         full_path = build_path_from_dentry(dentry);
         if (full_path == NULL) {
                 rc = -ENOMEM;
-                goto check_inval;
+                goto out;
         }
 
-        cFYI(1, "Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
-             "jiffies %ld", full_path, inode, inode->i_count.counter,
+        cFYI(1, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time "
+             "%ld jiffies %ld", full_path, inode, inode->i_count.counter,
              dentry, dentry->d_time, jiffies);
 
         if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
@@ -1756,41 +1753,83 @@ int cifs_revalidate_dentry(struct dentry *dentry)
                 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
                                          xid, NULL);
 
-check_inval:
-        if (CIFS_I(inode)->invalid_mapping)
-                cifs_invalidate_mapping(inode);
-
+out:
         kfree(full_path);
         FreeXid(xid);
         return rc;
 }
 
+int cifs_revalidate_file(struct file *filp)
+{
+        int rc;
+        struct inode *inode = filp->f_path.dentry->d_inode;
+
+        rc = cifs_revalidate_file_attr(filp);
+        if (rc)
+                return rc;
+
+        if (CIFS_I(inode)->invalid_mapping)
+                rc = cifs_invalidate_mapping(inode);
+        return rc;
+}
+
+/* revalidate a dentry's inode attributes */
+int cifs_revalidate_dentry(struct dentry *dentry)
+{
+        int rc;
+        struct inode *inode = dentry->d_inode;
+
+        rc = cifs_revalidate_dentry_attr(dentry);
+        if (rc)
+                return rc;
+
+        if (CIFS_I(inode)->invalid_mapping)
+                rc = cifs_invalidate_mapping(inode);
+        return rc;
+}
+
 int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
                  struct kstat *stat)
 {
         struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
-        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
-        int err = cifs_revalidate_dentry(dentry);
-
-        if (!err) {
-                generic_fillattr(dentry->d_inode, stat);
-                stat->blksize = CIFS_MAX_MSGSIZE;
-                stat->ino = CIFS_I(dentry->d_inode)->uniqueid;
-
-                /*
-                 * If on a multiuser mount without unix extensions, and the
-                 * admin hasn't overridden them, set the ownership to the
-                 * fsuid/fsgid of the current process.
-                 */
-                if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
-                    !tcon->unix_ext) {
-                        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
-                                stat->uid = current_fsuid();
-                        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
-                                stat->gid = current_fsgid();
-                }
-        }
-        return err;
+        struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+        struct inode *inode = dentry->d_inode;
+        int rc;
+
+        /*
+         * We need to be sure that all dirty pages are written and the server
+         * has actual ctime, mtime and file length.
+         */
+        if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping &&
+            inode->i_mapping->nrpages != 0) {
+                rc = filemap_fdatawait(inode->i_mapping);
+                if (rc) {
+                        mapping_set_error(inode->i_mapping, rc);
+                        return rc;
+                }
+        }
+
+        rc = cifs_revalidate_dentry_attr(dentry);
+        if (rc)
+                return rc;
+
+        generic_fillattr(inode, stat);
+        stat->blksize = CIFS_MAX_MSGSIZE;
+        stat->ino = CIFS_I(inode)->uniqueid;
+
+        /*
+         * If on a multiuser mount without unix extensions, and the admin hasn't
+         * overridden them, set the ownership to the fsuid/fsgid of the current
+         * process.
+         */
+        if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
+            !tcon->unix_ext) {
+                if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
+                        stat->uid = current_fsuid();
+                if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
+                        stat->gid = current_fsgid();
+        }
+        return rc;
 }
 
 static int cifs_truncate_page(struct address_space *mapping, loff_t from)
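Attribute revalidation and pagecache invalidation are now two separate steps: the *_attr helpers only refresh metadata from the server, and each caller decides what a failed invalidation means (getattr additionally waits on dirty pages first, so the server-side times and length are current). The composition used by the new wrappers:

/* Sketch: refresh attributes, then zap the cache only if flagged. */
int cifs_revalidate_dentry(struct dentry *dentry)
{
        int rc = cifs_revalidate_dentry_attr(dentry);   /* server attrs */

        if (rc)
                return rc;
        if (CIFS_I(dentry->d_inode)->invalid_mapping)
                rc = cifs_invalidate_mapping(dentry->d_inode); /* drop cache */
        return rc;
}
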
@@ -1831,7 +1870,8 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
         struct cifsInodeInfo *cifsInode = CIFS_I(inode);
         struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
         struct tcon_link *tlink = NULL;
-        struct cifsTconInfo *pTcon = NULL;
+        struct cifs_tcon *pTcon = NULL;
+        struct cifs_io_parms io_parms;
 
         /*
          * To avoid spurious oplock breaks from server, in the case of
@@ -1853,8 +1893,14 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
                 cFYI(1, "SetFSize for attrs rc = %d", rc);
                 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
                         unsigned int bytes_written;
-                        rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size,
-                                          &bytes_written, NULL, NULL, 1);
+
+                        io_parms.netfid = nfid;
+                        io_parms.pid = npid;
+                        io_parms.tcon = pTcon;
+                        io_parms.offset = 0;
+                        io_parms.length = attrs->ia_size;
+                        rc = CIFSSMBWrite(xid, &io_parms, &bytes_written,
+                                          NULL, NULL, 1);
                         cFYI(1, "Wrt seteof rc %d", rc);
                 }
         } else
@@ -1889,10 +1935,15 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1889 CIFS_MOUNT_MAP_SPECIAL_CHR); 1935 CIFS_MOUNT_MAP_SPECIAL_CHR);
1890 if (rc == 0) { 1936 if (rc == 0) {
1891 unsigned int bytes_written; 1937 unsigned int bytes_written;
1892 rc = CIFSSMBWrite(xid, pTcon, netfid, 0, 1938
1893 attrs->ia_size, 1939 io_parms.netfid = netfid;
1894 &bytes_written, NULL, 1940 io_parms.pid = current->tgid;
1895 NULL, 1); 1941 io_parms.tcon = pTcon;
1942 io_parms.offset = 0;
1943 io_parms.length = attrs->ia_size;
1944 rc = CIFSSMBWrite(xid, &io_parms,
1945 &bytes_written,
1946 NULL, NULL, 1);
1896 cFYI(1, "wrt seteof rc %d", rc); 1947 cFYI(1, "wrt seteof rc %d", rc);
1897 CIFSSMBClose(xid, pTcon, netfid); 1948 CIFSSMBClose(xid, pTcon, netfid);
1898 } 1949 }
@@ -1920,7 +1971,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1920 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 1971 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
1921 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1972 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1922 struct tcon_link *tlink; 1973 struct tcon_link *tlink;
1923 struct cifsTconInfo *pTcon; 1974 struct cifs_tcon *pTcon;
1924 struct cifs_unix_set_info_args *args = NULL; 1975 struct cifs_unix_set_info_args *args = NULL;
1925 struct cifsFileInfo *open_file; 1976 struct cifsFileInfo *open_file;
1926 1977
@@ -2206,7 +2257,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
2206{ 2257{
2207 struct inode *inode = direntry->d_inode; 2258 struct inode *inode = direntry->d_inode;
2208 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 2259 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
2209 struct cifsTconInfo *pTcon = cifs_sb_master_tcon(cifs_sb); 2260 struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb);
2210 2261
2211 if (pTcon->unix_ext) 2262 if (pTcon->unix_ext)
2212 return cifs_setattr_unix(direntry, attrs); 2263 return cifs_setattr_unix(direntry, attrs);
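The two cifs_set_file_size() hunks above replace CIFSSMBWrite()'s long positional argument list with a single cifs_io_parms bundle (netfid, pid, tcon, offset, length -- the fields visible in the diff). A minimal user-space sketch of why that pattern helps, using simplified stand-in types rather than the real kernel structures:

#include <stdio.h>

struct io_parms {
	unsigned short netfid;	/* open file handle on the server */
	unsigned int   pid;	/* pid stamped into the SMB header */
	void          *tcon;	/* tree connection (opaque here) */
	long long      offset;	/* file offset of the I/O */
	unsigned int   length;	/* bytes to transfer */
};

/* One struct argument replaces five positional ones, so a field
 * added later does not ripple through every caller. */
static int do_write(const struct io_parms *p, unsigned int *written)
{
	printf("write fid=%hu pid=%u off=%lld len=%u\n",
	       p->netfid, p->pid, p->offset, p->length);
	*written = p->length;	/* pretend the request fully succeeded */
	return 0;
}

int main(void)
{
	struct io_parms parms = {
		.netfid = 42, .pid = 1000, .tcon = NULL,
		.offset = 0, .length = 512,
	};
	unsigned int written;

	return do_write(&parms, &written);
}
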
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 0c98672d0122..4221b5e48a42 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -38,7 +38,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
38 struct cifs_sb_info *cifs_sb; 38 struct cifs_sb_info *cifs_sb;
39#ifdef CONFIG_CIFS_POSIX 39#ifdef CONFIG_CIFS_POSIX
40 struct cifsFileInfo *pSMBFile = filep->private_data; 40 struct cifsFileInfo *pSMBFile = filep->private_data;
41 struct cifsTconInfo *tcon; 41 struct cifs_tcon *tcon;
42 __u64 ExtAttrBits = 0; 42 __u64 ExtAttrBits = 0;
43 __u64 ExtAttrMask = 0; 43 __u64 ExtAttrMask = 0;
44 __u64 caps; 44 __u64 caps;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index ce417a9764a3..556b1a0b54de 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -175,7 +175,7 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
175} 175}
176 176
177static int 177static int
178CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon, 178CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon,
179 const char *fromName, const char *toName, 179 const char *fromName, const char *toName,
180 const struct nls_table *nls_codepage, int remap) 180 const struct nls_table *nls_codepage, int remap)
181{ 181{
@@ -184,6 +184,7 @@ CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon,
184 __u16 netfid = 0; 184 __u16 netfid = 0;
185 u8 *buf; 185 u8 *buf;
186 unsigned int bytes_written = 0; 186 unsigned int bytes_written = 0;
187 struct cifs_io_parms io_parms;
187 188
188 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); 189 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
189 if (!buf) 190 if (!buf)
@@ -203,10 +204,13 @@ CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon,
203 return rc; 204 return rc;
204 } 205 }
205 206
206 rc = CIFSSMBWrite(xid, tcon, netfid, 207 io_parms.netfid = netfid;
207 CIFS_MF_SYMLINK_FILE_SIZE /* length */, 208 io_parms.pid = current->tgid;
208 0 /* offset */, 209 io_parms.tcon = tcon;
209 &bytes_written, buf, NULL, 0); 210 io_parms.offset = 0;
211 io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
212
213 rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, buf, NULL, 0);
210 CIFSSMBClose(xid, tcon, netfid); 214 CIFSSMBClose(xid, tcon, netfid);
211 kfree(buf); 215 kfree(buf);
212 if (rc != 0) 216 if (rc != 0)
@@ -219,7 +223,7 @@ CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon,
219} 223}
220 224
221static int 225static int
222CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon, 226CIFSQueryMFSymLink(const int xid, struct cifs_tcon *tcon,
223 const unsigned char *searchName, char **symlinkinfo, 227 const unsigned char *searchName, char **symlinkinfo,
224 const struct nls_table *nls_codepage, int remap) 228 const struct nls_table *nls_codepage, int remap)
225{ 229{
@@ -231,6 +235,7 @@ CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
231 unsigned int bytes_read = 0; 235 unsigned int bytes_read = 0;
232 int buf_type = CIFS_NO_BUFFER; 236 int buf_type = CIFS_NO_BUFFER;
233 unsigned int link_len = 0; 237 unsigned int link_len = 0;
238 struct cifs_io_parms io_parms;
234 FILE_ALL_INFO file_info; 239 FILE_ALL_INFO file_info;
235 240
236 rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ, 241 rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ,
@@ -249,11 +254,13 @@ CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
249 if (!buf) 254 if (!buf)
250 return -ENOMEM; 255 return -ENOMEM;
251 pbuf = buf; 256 pbuf = buf;
257 io_parms.netfid = netfid;
258 io_parms.pid = current->tgid;
259 io_parms.tcon = tcon;
260 io_parms.offset = 0;
261 io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
252 262
253 rc = CIFSSMBRead(xid, tcon, netfid, 263 rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type);
254 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
255 0 /* offset */,
256 &bytes_read, &pbuf, &buf_type);
257 CIFSSMBClose(xid, tcon, netfid); 264 CIFSSMBClose(xid, tcon, netfid);
258 if (rc != 0) { 265 if (rc != 0) {
259 kfree(buf); 266 kfree(buf);
@@ -291,7 +298,8 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr,
291 int oplock = 0; 298 int oplock = 0;
292 __u16 netfid = 0; 299 __u16 netfid = 0;
293 struct tcon_link *tlink; 300 struct tcon_link *tlink;
294 struct cifsTconInfo *pTcon; 301 struct cifs_tcon *pTcon;
302 struct cifs_io_parms io_parms;
295 u8 *buf; 303 u8 *buf;
296 char *pbuf; 304 char *pbuf;
297 unsigned int bytes_read = 0; 305 unsigned int bytes_read = 0;
@@ -328,11 +336,13 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr,
328 goto out; 336 goto out;
329 } 337 }
330 pbuf = buf; 338 pbuf = buf;
339 io_parms.netfid = netfid;
340 io_parms.pid = current->tgid;
341 io_parms.tcon = pTcon;
342 io_parms.offset = 0;
343 io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
331 344
332 rc = CIFSSMBRead(xid, pTcon, netfid, 345 rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type);
333 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
334 0 /* offset */,
335 &bytes_read, &pbuf, &buf_type);
336 CIFSSMBClose(xid, pTcon, netfid); 346 CIFSSMBClose(xid, pTcon, netfid);
337 if (rc != 0) { 347 if (rc != 0) {
338 kfree(buf); 348 kfree(buf);
@@ -370,7 +380,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
370 char *toName = NULL; 380 char *toName = NULL;
371 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 381 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
372 struct tcon_link *tlink; 382 struct tcon_link *tlink;
373 struct cifsTconInfo *pTcon; 383 struct cifs_tcon *pTcon;
374 struct cifsInodeInfo *cifsInode; 384 struct cifsInodeInfo *cifsInode;
375 385
376 tlink = cifs_sb_tlink(cifs_sb); 386 tlink = cifs_sb_tlink(cifs_sb);
@@ -445,7 +455,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
445 char *target_path = NULL; 455 char *target_path = NULL;
446 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 456 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
447 struct tcon_link *tlink = NULL; 457 struct tcon_link *tlink = NULL;
448 struct cifsTconInfo *tcon; 458 struct cifs_tcon *tcon;
449 459
450 xid = GetXid(); 460 xid = GetXid();
451 461
@@ -518,7 +528,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
518 int xid; 528 int xid;
519 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 529 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
520 struct tcon_link *tlink; 530 struct tcon_link *tlink;
521 struct cifsTconInfo *pTcon; 531 struct cifs_tcon *pTcon;
522 char *full_path = NULL; 532 char *full_path = NULL;
523 struct inode *newinode = NULL; 533 struct inode *newinode = NULL;
524 534
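The three MF-symlink paths above (create, query, check) each repeat the same five-assignment io_parms fill before calling CIFSSMBRead or CIFSSMBWrite. A tiny helper could factor that out; the one below is hypothetical, not part of the patch, and uses simplified stand-in types:

#include <stddef.h>
#include <stdint.h>

struct io_parms_like {	/* stand-in for struct cifs_io_parms */
	uint16_t netfid;
	uint32_t pid;
	void *tcon;
	int64_t offset;
	uint32_t length;
};

/* hypothetical helper: one call per site instead of five assignments */
static void fill_io_parms(struct io_parms_like *p, uint16_t netfid,
			  uint32_t pid, void *tcon, int64_t offset,
			  uint32_t length)
{
	p->netfid = netfid;
	p->pid = pid;
	p->tcon = tcon;
	p->offset = offset;
	p->length = length;
}

int main(void)
{
	struct io_parms_like p;

	fill_io_parms(&p, 7, 1000, NULL, 0, 4096);
	return p.length == 4096 ? 0 : 1;
}

With such a helper, each of the call sites above would shrink to a single line.
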
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 0c684ae4c071..03a1f491d39b 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -67,12 +67,12 @@ _FreeXid(unsigned int xid)
67 spin_unlock(&GlobalMid_Lock); 67 spin_unlock(&GlobalMid_Lock);
68} 68}
69 69
70struct cifsSesInfo * 70struct cifs_ses *
71sesInfoAlloc(void) 71sesInfoAlloc(void)
72{ 72{
73 struct cifsSesInfo *ret_buf; 73 struct cifs_ses *ret_buf;
74 74
75 ret_buf = kzalloc(sizeof(struct cifsSesInfo), GFP_KERNEL); 75 ret_buf = kzalloc(sizeof(struct cifs_ses), GFP_KERNEL);
76 if (ret_buf) { 76 if (ret_buf) {
77 atomic_inc(&sesInfoAllocCount); 77 atomic_inc(&sesInfoAllocCount);
78 ret_buf->status = CifsNew; 78 ret_buf->status = CifsNew;
@@ -85,7 +85,7 @@ sesInfoAlloc(void)
85} 85}
86 86
87void 87void
88sesInfoFree(struct cifsSesInfo *buf_to_free) 88sesInfoFree(struct cifs_ses *buf_to_free)
89{ 89{
90 if (buf_to_free == NULL) { 90 if (buf_to_free == NULL) {
91 cFYI(1, "Null buffer passed to sesInfoFree"); 91 cFYI(1, "Null buffer passed to sesInfoFree");
@@ -105,11 +105,11 @@ sesInfoFree(struct cifsSesInfo *buf_to_free)
105 kfree(buf_to_free); 105 kfree(buf_to_free);
106} 106}
107 107
108struct cifsTconInfo * 108struct cifs_tcon *
109tconInfoAlloc(void) 109tconInfoAlloc(void)
110{ 110{
111 struct cifsTconInfo *ret_buf; 111 struct cifs_tcon *ret_buf;
112 ret_buf = kzalloc(sizeof(struct cifsTconInfo), GFP_KERNEL); 112 ret_buf = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL);
113 if (ret_buf) { 113 if (ret_buf) {
114 atomic_inc(&tconInfoAllocCount); 114 atomic_inc(&tconInfoAllocCount);
115 ret_buf->tidStatus = CifsNew; 115 ret_buf->tidStatus = CifsNew;
@@ -124,7 +124,7 @@ tconInfoAlloc(void)
124} 124}
125 125
126void 126void
127tconInfoFree(struct cifsTconInfo *buf_to_free) 127tconInfoFree(struct cifs_tcon *buf_to_free)
128{ 128{
129 if (buf_to_free == NULL) { 129 if (buf_to_free == NULL) {
130 cFYI(1, "Null buffer passed to tconInfoFree"); 130 cFYI(1, "Null buffer passed to tconInfoFree");
@@ -295,21 +295,19 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
295 case it is responsibility of caller to set the mid */ 295 case it is responsibility of caller to set the mid */
296void 296void
297header_assemble(struct smb_hdr *buffer, char smb_command /* command */ , 297header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
298 const struct cifsTconInfo *treeCon, int word_count 298 const struct cifs_tcon *treeCon, int word_count
299 /* length of fixed section (word count) in two byte units */) 299 /* length of fixed section (word count) in two byte units */)
300{ 300{
301 struct list_head *temp_item; 301 struct list_head *temp_item;
302 struct cifsSesInfo *ses; 302 struct cifs_ses *ses;
303 char *temp = (char *) buffer; 303 char *temp = (char *) buffer;
304 304
305 memset(temp, 0, 256); /* bigger than MAX_CIFS_HDR_SIZE */ 305 memset(temp, 0, 256); /* bigger than MAX_CIFS_HDR_SIZE */
306 306
307 buffer->smb_buf_length = 307 buffer->smb_buf_length = cpu_to_be32(
308 (2 * word_count) + sizeof(struct smb_hdr) - 308 (2 * word_count) + sizeof(struct smb_hdr) -
309 4 /* RFC 1001 length field does not count */ + 309 4 /* RFC 1001 length field does not count */ +
310 2 /* for bcc field itself */ ; 310 2 /* for bcc field itself */) ;
311 /* Note that this is the only network field that has to be converted
312 to big endian and it is done just before we send it */
313 311
314 buffer->Protocol[0] = 0xFF; 312 buffer->Protocol[0] = 0xFF;
315 buffer->Protocol[1] = 'S'; 313 buffer->Protocol[1] = 'S';
@@ -361,7 +359,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
361 "did not match tcon uid"); 359 "did not match tcon uid");
362 spin_lock(&cifs_tcp_ses_lock); 360 spin_lock(&cifs_tcp_ses_lock);
363 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) { 361 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
364 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list); 362 ses = list_entry(temp_item, struct cifs_ses, smb_ses_list);
365 if (ses->linux_uid == current_fsuid()) { 363 if (ses->linux_uid == current_fsuid()) {
366 if (ses->server == treeCon->ses->server) { 364 if (ses->server == treeCon->ses->server) {
367 cFYI(1, "found matching uid substitute right smb_uid"); 365 cFYI(1, "found matching uid substitute right smb_uid");
@@ -382,7 +380,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
382 if (treeCon->nocase) 380 if (treeCon->nocase)
383 buffer->Flags |= SMBFLG_CASELESS; 381 buffer->Flags |= SMBFLG_CASELESS;
384 if ((treeCon->ses) && (treeCon->ses->server)) 382 if ((treeCon->ses) && (treeCon->ses->server))
385 if (treeCon->ses->server->secMode & 383 if (treeCon->ses->server->sec_mode &
386 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 384 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
387 buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; 385 buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
388 } 386 }
@@ -424,7 +422,7 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
424int 422int
425checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) 423checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
426{ 424{
427 __u32 len = smb->smb_buf_length; 425 __u32 len = be32_to_cpu(smb->smb_buf_length);
428 __u32 clc_len; /* calculated length */ 426 __u32 clc_len; /* calculated length */
429 cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len); 427 cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len);
430 428
@@ -464,7 +462,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
464 462
465 if (check_smb_hdr(smb, mid)) 463 if (check_smb_hdr(smb, mid))
466 return 1; 464 return 1;
467 clc_len = smbCalcSize_LE(smb); 465 clc_len = smbCalcSize(smb);
468 466
469 if (4 + len != length) { 467 if (4 + len != length) {
470 cERROR(1, "Length read does not match RFC1001 length %d", 468 cERROR(1, "Length read does not match RFC1001 length %d",
@@ -509,8 +507,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
509{ 507{
510 struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf; 508 struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf;
511 struct list_head *tmp, *tmp1, *tmp2; 509 struct list_head *tmp, *tmp1, *tmp2;
512 struct cifsSesInfo *ses; 510 struct cifs_ses *ses;
513 struct cifsTconInfo *tcon; 511 struct cifs_tcon *tcon;
514 struct cifsInodeInfo *pCifsInode; 512 struct cifsInodeInfo *pCifsInode;
515 struct cifsFileInfo *netfile; 513 struct cifsFileInfo *netfile;
516 514
@@ -521,7 +519,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
521 (struct smb_com_transaction_change_notify_rsp *)buf; 519 (struct smb_com_transaction_change_notify_rsp *)buf;
522 struct file_notify_information *pnotify; 520 struct file_notify_information *pnotify;
523 __u32 data_offset = 0; 521 __u32 data_offset = 0;
524 if (get_bcc_le(buf) > sizeof(struct file_notify_information)) { 522 if (get_bcc(buf) > sizeof(struct file_notify_information)) {
525 data_offset = le32_to_cpu(pSMBr->DataOffset); 523 data_offset = le32_to_cpu(pSMBr->DataOffset);
526 524
527 pnotify = (struct file_notify_information *) 525 pnotify = (struct file_notify_information *)
@@ -568,9 +566,9 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
568 /* look up tcon based on tid & uid */ 566 /* look up tcon based on tid & uid */
569 spin_lock(&cifs_tcp_ses_lock); 567 spin_lock(&cifs_tcp_ses_lock);
570 list_for_each(tmp, &srv->smb_ses_list) { 568 list_for_each(tmp, &srv->smb_ses_list) {
571 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 569 ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
572 list_for_each(tmp1, &ses->tcon_list) { 570 list_for_each(tmp1, &ses->tcon_list) {
573 tcon = list_entry(tmp1, struct cifsTconInfo, tcon_list); 571 tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
574 if (tcon->tid != buf->Tid) 572 if (tcon->tid != buf->Tid)
575 continue; 573 continue;
576 574
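The header_assemble() and checkSMB() hunks above change the convention for smb_buf_length: it is stored big-endian from assembly time onward, and every reader converts back with be32_to_cpu(). A user-space sketch of the same round trip, with htonl()/ntohl() standing in for cpu_to_be32()/be32_to_cpu() and 32 as an arbitrary stand-in for sizeof(struct smb_hdr):

#include <arpa/inet.h>
#include <assert.h>
#include <stdint.h>

int main(void)
{
	unsigned int word_count = 10;		/* parameter words */
	uint32_t host_len = 2 * word_count + 32 - 4 + 2;

	uint32_t wire_len = htonl(host_len);	/* cpu_to_be32 at assembly */
	assert(ntohl(wire_len) == host_len);	/* be32_to_cpu in checkSMB */
	return 0;
}
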
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 79f641eeda30..73e47e84b61a 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -836,7 +836,7 @@ ntstatus_to_dos(__u32 ntstatus, __u8 *eclass, __u16 *ecode)
836} 836}
837 837
838int 838int
839map_smb_to_linux_error(struct smb_hdr *smb, int logErr) 839map_smb_to_linux_error(struct smb_hdr *smb, bool logErr)
840{ 840{
841 unsigned int i; 841 unsigned int i;
842 int rc = -EIO; /* if transport error smb error may not be set */ 842 int rc = -EIO; /* if transport error smb error may not be set */
@@ -919,13 +919,6 @@ smbCalcSize(struct smb_hdr *ptr)
919 2 /* size of the bcc field */ + get_bcc(ptr)); 919 2 /* size of the bcc field */ + get_bcc(ptr));
920} 920}
921 921
922unsigned int
923smbCalcSize_LE(struct smb_hdr *ptr)
924{
925 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
926 2 /* size of the bcc field */ + get_bcc_le(ptr));
927}
928
929/* The following are taken from fs/ntfs/util.c */ 922/* The following are taken from fs/ntfs/util.c */
930 923
931#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000) 924#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
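With get_bcc() now the single byte-count accessor, the little-endian twin smbCalcSize_LE() removed above becomes dead code. What remains is one size formula; a runnable restatement, again with 32 standing in for sizeof(struct smb_hdr):

#include <stdio.h>

#define SMB_HDR_SIZE 32	/* stand-in, not the real struct size */

static unsigned int smb_calc_size(unsigned char word_count,
				  unsigned short bcc)
{
	/* header + parameter words (2 bytes each) + bcc field + data */
	return SMB_HDR_SIZE + 2 * word_count + 2 + bcc;
}

int main(void)
{
	printf("%u\n", smb_calc_size(10, 100));
	return 0;
}
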
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f8e4cd2a7912..6751e745bbc6 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -195,7 +195,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
195 int len; 195 int len;
196 int oplock = 0; 196 int oplock = 0;
197 int rc; 197 int rc;
198 struct cifsTconInfo *ptcon = cifs_sb_tcon(cifs_sb); 198 struct cifs_tcon *ptcon = cifs_sb_tcon(cifs_sb);
199 char *tmpbuffer; 199 char *tmpbuffer;
200 200
201 rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ, 201 rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
@@ -223,7 +223,7 @@ static int initiate_cifs_search(const int xid, struct file *file)
223 struct cifsFileInfo *cifsFile; 223 struct cifsFileInfo *cifsFile;
224 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 224 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
225 struct tcon_link *tlink = NULL; 225 struct tcon_link *tlink = NULL;
226 struct cifsTconInfo *pTcon; 226 struct cifs_tcon *pTcon;
227 227
228 if (file->private_data == NULL) { 228 if (file->private_data == NULL) {
229 tlink = cifs_sb_tlink(cifs_sb); 229 tlink = cifs_sb_tlink(cifs_sb);
@@ -496,7 +496,7 @@ static int cifs_save_resume_key(const char *current_entry,
496 assume that they are located in the findfirst return buffer.*/ 496 assume that they are located in the findfirst return buffer.*/
497/* We start counting in the buffer with entry 2 and increment for every 497/* We start counting in the buffer with entry 2 and increment for every
498 entry (do not increment for . or .. entry) */ 498 entry (do not increment for . or .. entry) */
499static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, 499static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon,
500 struct file *file, char **ppCurrentEntry, int *num_to_ret) 500 struct file *file, char **ppCurrentEntry, int *num_to_ret)
501{ 501{
502 int rc = 0; 502 int rc = 0;
@@ -764,7 +764,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
764{ 764{
765 int rc = 0; 765 int rc = 0;
766 int xid, i; 766 int xid, i;
767 struct cifsTconInfo *pTcon; 767 struct cifs_tcon *pTcon;
768 struct cifsFileInfo *cifsFile = NULL; 768 struct cifsFileInfo *cifsFile = NULL;
769 char *current_entry; 769 char *current_entry;
770 int num_to_fill = 0; 770 int num_to_fill = 0;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 645114ad0a10..3892ab817a36 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -37,13 +37,13 @@
37 * the socket has been reestablished (so we know whether to use vc 0). 37 * the socket has been reestablished (so we know whether to use vc 0).
38 * Called while holding the cifs_tcp_ses_lock, so do not block 38 * Called while holding the cifs_tcp_ses_lock, so do not block
39 */ 39 */
40static bool is_first_ses_reconnect(struct cifsSesInfo *ses) 40static bool is_first_ses_reconnect(struct cifs_ses *ses)
41{ 41{
42 struct list_head *tmp; 42 struct list_head *tmp;
43 struct cifsSesInfo *tmp_ses; 43 struct cifs_ses *tmp_ses;
44 44
45 list_for_each(tmp, &ses->server->smb_ses_list) { 45 list_for_each(tmp, &ses->server->smb_ses_list) {
46 tmp_ses = list_entry(tmp, struct cifsSesInfo, 46 tmp_ses = list_entry(tmp, struct cifs_ses,
47 smb_ses_list); 47 smb_ses_list);
48 if (tmp_ses->need_reconnect == false) 48 if (tmp_ses->need_reconnect == false)
49 return false; 49 return false;
@@ -61,11 +61,11 @@ static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
61 * any vc but zero (some servers reset the connection on vcnum zero) 61 * any vc but zero (some servers reset the connection on vcnum zero)
62 * 62 *
63 */ 63 */
64static __le16 get_next_vcnum(struct cifsSesInfo *ses) 64static __le16 get_next_vcnum(struct cifs_ses *ses)
65{ 65{
66 __u16 vcnum = 0; 66 __u16 vcnum = 0;
67 struct list_head *tmp; 67 struct list_head *tmp;
68 struct cifsSesInfo *tmp_ses; 68 struct cifs_ses *tmp_ses;
69 __u16 max_vcs = ses->server->max_vcs; 69 __u16 max_vcs = ses->server->max_vcs;
70 __u16 i; 70 __u16 i;
71 int free_vc_found = 0; 71 int free_vc_found = 0;
@@ -87,7 +87,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
87 free_vc_found = 1; 87 free_vc_found = 1;
88 88
89 list_for_each(tmp, &ses->server->smb_ses_list) { 89 list_for_each(tmp, &ses->server->smb_ses_list) {
90 tmp_ses = list_entry(tmp, struct cifsSesInfo, 90 tmp_ses = list_entry(tmp, struct cifs_ses,
91 smb_ses_list); 91 smb_ses_list);
92 if (tmp_ses->vcnum == i) { 92 if (tmp_ses->vcnum == i) {
93 free_vc_found = 0; 93 free_vc_found = 0;
@@ -114,7 +114,7 @@ get_vc_num_exit:
114 return cpu_to_le16(vcnum); 114 return cpu_to_le16(vcnum);
115} 115}
116 116
117static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB) 117static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
118{ 118{
119 __u32 capabilities = 0; 119 __u32 capabilities = 0;
120 120
@@ -136,7 +136,7 @@ static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
136 capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS | 136 capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
137 CAP_LARGE_WRITE_X | CAP_LARGE_READ_X; 137 CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
138 138
139 if (ses->server->secMode & 139 if (ses->server->sec_mode &
140 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 140 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
141 pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; 141 pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
142 142
@@ -181,7 +181,7 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)
181 *pbcc_area = bcc_ptr; 181 *pbcc_area = bcc_ptr;
182} 182}
183 183
184static void unicode_domain_string(char **pbcc_area, struct cifsSesInfo *ses, 184static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
185 const struct nls_table *nls_cp) 185 const struct nls_table *nls_cp)
186{ 186{
187 char *bcc_ptr = *pbcc_area; 187 char *bcc_ptr = *pbcc_area;
@@ -204,7 +204,7 @@ static void unicode_domain_string(char **pbcc_area, struct cifsSesInfo *ses,
204} 204}
205 205
206 206
207static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses, 207static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
208 const struct nls_table *nls_cp) 208 const struct nls_table *nls_cp)
209{ 209{
210 char *bcc_ptr = *pbcc_area; 210 char *bcc_ptr = *pbcc_area;
@@ -236,7 +236,7 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
236 *pbcc_area = bcc_ptr; 236 *pbcc_area = bcc_ptr;
237} 237}
238 238
239static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses, 239static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
240 const struct nls_table *nls_cp) 240 const struct nls_table *nls_cp)
241{ 241{
242 char *bcc_ptr = *pbcc_area; 242 char *bcc_ptr = *pbcc_area;
@@ -276,7 +276,7 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
276} 276}
277 277
278static void 278static void
279decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses, 279decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
280 const struct nls_table *nls_cp) 280 const struct nls_table *nls_cp)
281{ 281{
282 int len; 282 int len;
@@ -310,7 +310,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
310} 310}
311 311
312static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft, 312static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
313 struct cifsSesInfo *ses, 313 struct cifs_ses *ses,
314 const struct nls_table *nls_cp) 314 const struct nls_table *nls_cp)
315{ 315{
316 int rc = 0; 316 int rc = 0;
@@ -364,7 +364,7 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
364} 364}
365 365
366static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, 366static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
367 struct cifsSesInfo *ses) 367 struct cifs_ses *ses)
368{ 368{
369 unsigned int tioffset; /* challenge message target info area */ 369 unsigned int tioffset; /* challenge message target info area */
370 unsigned int tilen; /* challenge message target info area length */ 370 unsigned int tilen; /* challenge message target info area length */
@@ -411,7 +411,7 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
411/* We do not malloc the blob, it is passed in pbuffer, because 411/* We do not malloc the blob, it is passed in pbuffer, because
412 it is fixed size, and small, making this approach cleaner */ 412 it is fixed size, and small, making this approach cleaner */
413static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, 413static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
414 struct cifsSesInfo *ses) 414 struct cifs_ses *ses)
415{ 415{
416 NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer; 416 NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
417 __u32 flags; 417 __u32 flags;
@@ -424,7 +424,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
424 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | 424 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
425 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 425 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
426 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; 426 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
427 if (ses->server->secMode & 427 if (ses->server->sec_mode &
428 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { 428 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
429 flags |= NTLMSSP_NEGOTIATE_SIGN; 429 flags |= NTLMSSP_NEGOTIATE_SIGN;
430 if (!ses->server->session_estab) 430 if (!ses->server->session_estab)
@@ -449,7 +449,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
449 This function returns the length of the data in the blob */ 449 This function returns the length of the data in the blob */
450static int build_ntlmssp_auth_blob(unsigned char *pbuffer, 450static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
451 u16 *buflen, 451 u16 *buflen,
452 struct cifsSesInfo *ses, 452 struct cifs_ses *ses,
453 const struct nls_table *nls_cp) 453 const struct nls_table *nls_cp)
454{ 454{
455 int rc; 455 int rc;
@@ -464,10 +464,10 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
464 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | 464 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
465 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 465 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
466 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; 466 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
467 if (ses->server->secMode & 467 if (ses->server->sec_mode &
468 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 468 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
469 flags |= NTLMSSP_NEGOTIATE_SIGN; 469 flags |= NTLMSSP_NEGOTIATE_SIGN;
470 if (ses->server->secMode & SECMODE_SIGN_REQUIRED) 470 if (ses->server->sec_mode & SECMODE_SIGN_REQUIRED)
471 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; 471 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
472 472
473 tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE); 473 tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE);
@@ -551,7 +551,7 @@ setup_ntlmv2_ret:
551} 551}
552 552
553int 553int
554CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, 554CIFS_SessSetup(unsigned int xid, struct cifs_ses *ses,
555 const struct nls_table *nls_cp) 555 const struct nls_table *nls_cp)
556{ 556{
557 int rc = 0; 557 int rc = 0;
@@ -621,7 +621,7 @@ ssetup_ntlmssp_authenticate:
621 and rest of bcc area. This allows us to avoid 621 and rest of bcc area. This allows us to avoid
622 a large buffer 17K allocation */ 622 a large buffer 17K allocation */
623 iov[0].iov_base = (char *)pSMB; 623 iov[0].iov_base = (char *)pSMB;
624 iov[0].iov_len = smb_buf->smb_buf_length + 4; 624 iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4;
625 625
626 /* setting this here allows the code at the end of the function 626 /* setting this here allows the code at the end of the function
627 to free the request buffer if there's an error */ 627 to free the request buffer if there's an error */
@@ -656,8 +656,8 @@ ssetup_ntlmssp_authenticate:
656 * to use challenge/response method (i.e. Password bit is 1). 656 * to use challenge/response method (i.e. Password bit is 1).
657 */ 657 */
658 658
659 calc_lanman_hash(ses->password, ses->server->cryptkey, 659 rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
660 ses->server->secMode & SECMODE_PW_ENCRYPT ? 660 ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
661 true : false, lnm_session_key); 661 true : false, lnm_session_key);
662 662
663 ses->flags |= CIFS_SES_LANMAN; 663 ses->flags |= CIFS_SES_LANMAN;
@@ -859,9 +859,10 @@ ssetup_ntlmssp_authenticate:
859 iov[2].iov_len = (long) bcc_ptr - (long) str_area; 859 iov[2].iov_len = (long) bcc_ptr - (long) str_area;
860 860
861 count = iov[1].iov_len + iov[2].iov_len; 861 count = iov[1].iov_len + iov[2].iov_len;
862 smb_buf->smb_buf_length += count; 862 smb_buf->smb_buf_length =
863 cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count);
863 864
864 put_bcc_le(count, smb_buf); 865 put_bcc(count, smb_buf);
865 866
866 rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, 867 rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
867 CIFS_LOG_ERROR); 868 CIFS_LOG_ERROR);
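Because smb_buf_length now lives in big-endian form (see the misc.c and transport.c changes elsewhere in this series), appending the byte count is no longer a plain +=: the hunk above converts to host order, adds, and converts back. The same round trip in user-space terms, htonl()/ntohl() standing in for the kernel helpers:

#include <arpa/inet.h>
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t len_be = htonl(100);	/* field kept in on-wire order */
	uint32_t count = 24;		/* bcc bytes being appended */

	/* a bare "len_be += count" would add to a byte-swapped value;
	 * convert, add, then convert back instead */
	len_be = htonl(ntohl(len_be) + count);

	assert(ntohl(len_be) == 124);
	return 0;
}
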
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
deleted file mode 100644
index 04721485925d..000000000000
--- a/fs/cifs/smbdes.c
+++ /dev/null
@@ -1,418 +0,0 @@
1/*
2 Unix SMB/Netbios implementation.
3 Version 1.9.
4
5 a partial implementation of DES designed for use in the
6 SMB authentication protocol
7
8 Copyright (C) Andrew Tridgell 1998
9 Modified by Steve French (sfrench@us.ibm.com) 2002,2004
10
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2 of the License, or
14 (at your option) any later version.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with this program; if not, write to the Free Software
23 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24*/
25
26/* NOTES:
27
28 This code makes no attempt to be fast! In fact, it is a very
29 slow implementation
30
31 This code is NOT a complete DES implementation. It implements only
32 the minimum necessary for SMB authentication, as used by all SMB
33 products (including every copy of Microsoft Windows95 ever sold)
34
35 In particular, it can only do an unchained forward DES pass. This
36 means it is not possible to use this code for encryption/decryption
37 of data, instead it is only useful as a "hash" algorithm.
38
39 There is no entry point into this code that allows normal DES operation.
40
41 I believe this means that this code does not come under ITAR
42 regulations but this is NOT a legal opinion. If you are concerned
43 about the applicability of ITAR regulations to this code then you
44 should confirm it for yourself (and maybe let me know if you come
45 up with a different answer to the one above)
46*/
47#include <linux/slab.h>
48#define uchar unsigned char
49
50static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9,
51 1, 58, 50, 42, 34, 26, 18,
52 10, 2, 59, 51, 43, 35, 27,
53 19, 11, 3, 60, 52, 44, 36,
54 63, 55, 47, 39, 31, 23, 15,
55 7, 62, 54, 46, 38, 30, 22,
56 14, 6, 61, 53, 45, 37, 29,
57 21, 13, 5, 28, 20, 12, 4
58};
59
60static uchar perm2[48] = { 14, 17, 11, 24, 1, 5,
61 3, 28, 15, 6, 21, 10,
62 23, 19, 12, 4, 26, 8,
63 16, 7, 27, 20, 13, 2,
64 41, 52, 31, 37, 47, 55,
65 30, 40, 51, 45, 33, 48,
66 44, 49, 39, 56, 34, 53,
67 46, 42, 50, 36, 29, 32
68};
69
70static uchar perm3[64] = { 58, 50, 42, 34, 26, 18, 10, 2,
71 60, 52, 44, 36, 28, 20, 12, 4,
72 62, 54, 46, 38, 30, 22, 14, 6,
73 64, 56, 48, 40, 32, 24, 16, 8,
74 57, 49, 41, 33, 25, 17, 9, 1,
75 59, 51, 43, 35, 27, 19, 11, 3,
76 61, 53, 45, 37, 29, 21, 13, 5,
77 63, 55, 47, 39, 31, 23, 15, 7
78};
79
80static uchar perm4[48] = { 32, 1, 2, 3, 4, 5,
81 4, 5, 6, 7, 8, 9,
82 8, 9, 10, 11, 12, 13,
83 12, 13, 14, 15, 16, 17,
84 16, 17, 18, 19, 20, 21,
85 20, 21, 22, 23, 24, 25,
86 24, 25, 26, 27, 28, 29,
87 28, 29, 30, 31, 32, 1
88};
89
90static uchar perm5[32] = { 16, 7, 20, 21,
91 29, 12, 28, 17,
92 1, 15, 23, 26,
93 5, 18, 31, 10,
94 2, 8, 24, 14,
95 32, 27, 3, 9,
96 19, 13, 30, 6,
97 22, 11, 4, 25
98};
99
100static uchar perm6[64] = { 40, 8, 48, 16, 56, 24, 64, 32,
101 39, 7, 47, 15, 55, 23, 63, 31,
102 38, 6, 46, 14, 54, 22, 62, 30,
103 37, 5, 45, 13, 53, 21, 61, 29,
104 36, 4, 44, 12, 52, 20, 60, 28,
105 35, 3, 43, 11, 51, 19, 59, 27,
106 34, 2, 42, 10, 50, 18, 58, 26,
107 33, 1, 41, 9, 49, 17, 57, 25
108};
109
110static uchar sc[16] = { 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 };
111
112static uchar sbox[8][4][16] = {
113 {{14, 4, 13, 1, 2, 15, 11, 8, 3, 10, 6, 12, 5, 9, 0, 7},
114 {0, 15, 7, 4, 14, 2, 13, 1, 10, 6, 12, 11, 9, 5, 3, 8},
115 {4, 1, 14, 8, 13, 6, 2, 11, 15, 12, 9, 7, 3, 10, 5, 0},
116 {15, 12, 8, 2, 4, 9, 1, 7, 5, 11, 3, 14, 10, 0, 6, 13} },
117
118 {{15, 1, 8, 14, 6, 11, 3, 4, 9, 7, 2, 13, 12, 0, 5, 10},
119 {3, 13, 4, 7, 15, 2, 8, 14, 12, 0, 1, 10, 6, 9, 11, 5},
120 {0, 14, 7, 11, 10, 4, 13, 1, 5, 8, 12, 6, 9, 3, 2, 15},
121 {13, 8, 10, 1, 3, 15, 4, 2, 11, 6, 7, 12, 0, 5, 14, 9} },
122
123 {{10, 0, 9, 14, 6, 3, 15, 5, 1, 13, 12, 7, 11, 4, 2, 8},
124 {13, 7, 0, 9, 3, 4, 6, 10, 2, 8, 5, 14, 12, 11, 15, 1},
125 {13, 6, 4, 9, 8, 15, 3, 0, 11, 1, 2, 12, 5, 10, 14, 7},
126 {1, 10, 13, 0, 6, 9, 8, 7, 4, 15, 14, 3, 11, 5, 2, 12} },
127
128 {{7, 13, 14, 3, 0, 6, 9, 10, 1, 2, 8, 5, 11, 12, 4, 15},
129 {13, 8, 11, 5, 6, 15, 0, 3, 4, 7, 2, 12, 1, 10, 14, 9},
130 {10, 6, 9, 0, 12, 11, 7, 13, 15, 1, 3, 14, 5, 2, 8, 4},
131 {3, 15, 0, 6, 10, 1, 13, 8, 9, 4, 5, 11, 12, 7, 2, 14} },
132
133 {{2, 12, 4, 1, 7, 10, 11, 6, 8, 5, 3, 15, 13, 0, 14, 9},
134 {14, 11, 2, 12, 4, 7, 13, 1, 5, 0, 15, 10, 3, 9, 8, 6},
135 {4, 2, 1, 11, 10, 13, 7, 8, 15, 9, 12, 5, 6, 3, 0, 14},
136 {11, 8, 12, 7, 1, 14, 2, 13, 6, 15, 0, 9, 10, 4, 5, 3} },
137
138 {{12, 1, 10, 15, 9, 2, 6, 8, 0, 13, 3, 4, 14, 7, 5, 11},
139 {10, 15, 4, 2, 7, 12, 9, 5, 6, 1, 13, 14, 0, 11, 3, 8},
140 {9, 14, 15, 5, 2, 8, 12, 3, 7, 0, 4, 10, 1, 13, 11, 6},
141 {4, 3, 2, 12, 9, 5, 15, 10, 11, 14, 1, 7, 6, 0, 8, 13} },
142
143 {{4, 11, 2, 14, 15, 0, 8, 13, 3, 12, 9, 7, 5, 10, 6, 1},
144 {13, 0, 11, 7, 4, 9, 1, 10, 14, 3, 5, 12, 2, 15, 8, 6},
145 {1, 4, 11, 13, 12, 3, 7, 14, 10, 15, 6, 8, 0, 5, 9, 2},
146 {6, 11, 13, 8, 1, 4, 10, 7, 9, 5, 0, 15, 14, 2, 3, 12} },
147
148 {{13, 2, 8, 4, 6, 15, 11, 1, 10, 9, 3, 14, 5, 0, 12, 7},
149 {1, 15, 13, 8, 10, 3, 7, 4, 12, 5, 6, 11, 0, 14, 9, 2},
150 {7, 11, 4, 1, 9, 12, 14, 2, 0, 6, 10, 13, 15, 3, 5, 8},
151 {2, 1, 14, 7, 4, 10, 8, 13, 15, 12, 9, 0, 3, 5, 6, 11} }
152};
153
154static void
155permute(char *out, char *in, uchar *p, int n)
156{
157 int i;
158 for (i = 0; i < n; i++)
159 out[i] = in[p[i] - 1];
160}
161
162static void
163lshift(char *d, int count, int n)
164{
165 char out[64];
166 int i;
167 for (i = 0; i < n; i++)
168 out[i] = d[(i + count) % n];
169 for (i = 0; i < n; i++)
170 d[i] = out[i];
171}
172
173static void
174concat(char *out, char *in1, char *in2, int l1, int l2)
175{
176 while (l1--)
177 *out++ = *in1++;
178 while (l2--)
179 *out++ = *in2++;
180}
181
182static void
183xor(char *out, char *in1, char *in2, int n)
184{
185 int i;
186 for (i = 0; i < n; i++)
187 out[i] = in1[i] ^ in2[i];
188}
189
190static void
191dohash(char *out, char *in, char *key, int forw)
192{
193 int i, j, k;
194 char *pk1;
195 char c[28];
196 char d[28];
197 char *cd;
198 char (*ki)[48];
199 char *pd1;
200 char l[32], r[32];
201 char *rl;
202
203 /* Have to reduce stack usage */
204 pk1 = kmalloc(56+56+64+64, GFP_KERNEL);
205 if (pk1 == NULL)
206 return;
207
208 ki = kmalloc(16*48, GFP_KERNEL);
209 if (ki == NULL) {
210 kfree(pk1);
211 return;
212 }
213
214 cd = pk1 + 56;
215 pd1 = cd + 56;
216 rl = pd1 + 64;
217
218 permute(pk1, key, perm1, 56);
219
220 for (i = 0; i < 28; i++)
221 c[i] = pk1[i];
222 for (i = 0; i < 28; i++)
223 d[i] = pk1[i + 28];
224
225 for (i = 0; i < 16; i++) {
226 lshift(c, sc[i], 28);
227 lshift(d, sc[i], 28);
228
229 concat(cd, c, d, 28, 28);
230 permute(ki[i], cd, perm2, 48);
231 }
232
233 permute(pd1, in, perm3, 64);
234
235 for (j = 0; j < 32; j++) {
236 l[j] = pd1[j];
237 r[j] = pd1[j + 32];
238 }
239
240 for (i = 0; i < 16; i++) {
241 char *er; /* er[48] */
242 char *erk; /* erk[48] */
243 char b[8][6];
244 char *cb; /* cb[32] */
245 char *pcb; /* pcb[32] */
246 char *r2; /* r2[32] */
247
248 er = kmalloc(48+48+32+32+32, GFP_KERNEL);
249 if (er == NULL) {
250 kfree(pk1);
251 kfree(ki);
252 return;
253 }
254 erk = er+48;
255 cb = erk+48;
256 pcb = cb+32;
257 r2 = pcb+32;
258
259 permute(er, r, perm4, 48);
260
261 xor(erk, er, ki[forw ? i : 15 - i], 48);
262
263 for (j = 0; j < 8; j++)
264 for (k = 0; k < 6; k++)
265 b[j][k] = erk[j * 6 + k];
266
267 for (j = 0; j < 8; j++) {
268 int m, n;
269 m = (b[j][0] << 1) | b[j][5];
270
271 n = (b[j][1] << 3) | (b[j][2] << 2) | (b[j][3] <<
272 1) | b[j][4];
273
274 for (k = 0; k < 4; k++)
275 b[j][k] =
276 (sbox[j][m][n] & (1 << (3 - k))) ? 1 : 0;
277 }
278
279 for (j = 0; j < 8; j++)
280 for (k = 0; k < 4; k++)
281 cb[j * 4 + k] = b[j][k];
282 permute(pcb, cb, perm5, 32);
283
284 xor(r2, l, pcb, 32);
285
286 for (j = 0; j < 32; j++)
287 l[j] = r[j];
288
289 for (j = 0; j < 32; j++)
290 r[j] = r2[j];
291
292 kfree(er);
293 }
294
295 concat(rl, r, l, 32, 32);
296
297 permute(out, rl, perm6, 64);
298 kfree(pk1);
299 kfree(ki);
300}
301
302static void
303str_to_key(unsigned char *str, unsigned char *key)
304{
305 int i;
306
307 key[0] = str[0] >> 1;
308 key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2);
309 key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3);
310 key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4);
311 key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5);
312 key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6);
313 key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7);
314 key[7] = str[6] & 0x7F;
315 for (i = 0; i < 8; i++)
316 key[i] = (key[i] << 1);
317}
318
319static void
320smbhash(unsigned char *out, const unsigned char *in, unsigned char *key,
321 int forw)
322{
323 int i;
324 char *outb; /* outb[64] */
325 char *inb; /* inb[64] */
326 char *keyb; /* keyb[64] */
327 unsigned char key2[8];
328
329 outb = kmalloc(64 * 3, GFP_KERNEL);
330 if (outb == NULL)
331 return;
332
333 inb = outb + 64;
334 keyb = inb + 64;
335
336 str_to_key(key, key2);
337
338 for (i = 0; i < 64; i++) {
339 inb[i] = (in[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
340 keyb[i] = (key2[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
341 outb[i] = 0;
342 }
343
344 dohash(outb, inb, keyb, forw);
345
346 for (i = 0; i < 8; i++)
347 out[i] = 0;
348
349 for (i = 0; i < 64; i++) {
350 if (outb[i])
351 out[i / 8] |= (1 << (7 - (i % 8)));
352 }
353 kfree(outb);
354}
355
356void
357E_P16(unsigned char *p14, unsigned char *p16)
358{
359 unsigned char sp8[8] =
360 { 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25 };
361 smbhash(p16, sp8, p14, 1);
362 smbhash(p16 + 8, sp8, p14 + 7, 1);
363}
364
365void
366E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
367{
368 smbhash(p24, c8, p21, 1);
369 smbhash(p24 + 8, c8, p21 + 7, 1);
370 smbhash(p24 + 16, c8, p21 + 14, 1);
371}
372
373#if 0 /* currently unused */
374static void
375D_P16(unsigned char *p14, unsigned char *in, unsigned char *out)
376{
377 smbhash(out, in, p14, 0);
378 smbhash(out + 8, in + 8, p14 + 7, 0);
379}
380
381static void
382E_old_pw_hash(unsigned char *p14, unsigned char *in, unsigned char *out)
383{
384 smbhash(out, in, p14, 1);
385 smbhash(out + 8, in + 8, p14 + 7, 1);
386}
387/* these routines are currently unneeded, but may be
388 needed later */
389void
390cred_hash1(unsigned char *out, unsigned char *in, unsigned char *key)
391{
392 unsigned char buf[8];
393
394 smbhash(buf, in, key, 1);
395 smbhash(out, buf, key + 9, 1);
396}
397
398void
399cred_hash2(unsigned char *out, unsigned char *in, unsigned char *key)
400{
401 unsigned char buf[8];
402 static unsigned char key2[8];
403
404 smbhash(buf, in, key, 1);
405 key2[0] = key[7];
406 smbhash(out, buf, key2, 1);
407}
408
409void
410cred_hash3(unsigned char *out, unsigned char *in, unsigned char *key, int forw)
411{
412 static unsigned char key2[8];
413
414 smbhash(out, in, key, forw);
415 key2[0] = key[7];
416 smbhash(out + 8, in + 8, key2, forw);
417}
418#endif /* unneeded routines */
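The handrolled DES above is removed in favor of the kernel crypto API; the one piece that survives into smbencrypt.c (next diff) is the 7-byte-to-8-byte key expansion. A runnable extract of that str_to_key() logic, transcribed from the diff:

#include <stdio.h>

static void str_to_key(unsigned char *str, unsigned char *key)
{
	int i;

	/* spread 56 key bits across 8 bytes, 7 bits per byte... */
	key[0] = str[0] >> 1;
	key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2);
	key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3);
	key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4);
	key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5);
	key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6);
	key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7);
	key[7] = str[6] & 0x7F;
	/* ...then shift left so each byte's low (DES parity) bit is free */
	for (i = 0; i < 8; i++)
		key[i] = (key[i] << 1);
}

int main(void)
{
	unsigned char in[7] = { 'p', 'a', 's', 's', 'w', 'd', '!' };
	unsigned char key[8];
	int i;

	str_to_key(in, key);
	for (i = 0; i < 8; i++)
		printf("%02x ", key[i]);
	printf("\n");
	return 0;
}
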
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index b5041c849981..1525d5e662b6 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -47,6 +47,88 @@
47#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) 47#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
48#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val))) 48#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val)))
49 49
50static void
51str_to_key(unsigned char *str, unsigned char *key)
52{
53 int i;
54
55 key[0] = str[0] >> 1;
56 key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2);
57 key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3);
58 key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4);
59 key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5);
60 key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6);
61 key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7);
62 key[7] = str[6] & 0x7F;
63 for (i = 0; i < 8; i++)
64 key[i] = (key[i] << 1);
65}
66
67static int
68smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
69{
70 int rc;
71 unsigned char key2[8];
72 struct crypto_blkcipher *tfm_des;
73 struct scatterlist sgin, sgout;
74 struct blkcipher_desc desc;
75
76 str_to_key(key, key2);
77
78 tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC);
79 if (IS_ERR(tfm_des)) {
80 rc = PTR_ERR(tfm_des);
81 cERROR(1, "could not allocate des crypto API\n");
82 goto smbhash_err;
83 }
84
85 desc.tfm = tfm_des;
86
87 crypto_blkcipher_setkey(tfm_des, key2, 8);
88
89 sg_init_one(&sgin, in, 8);
90 sg_init_one(&sgout, out, 8);
91
92 rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8);
93 if (rc) {
94 cERROR(1, "could not encrypt crypt key rc: %d\n", rc);
95 crypto_free_blkcipher(tfm_des);
96 goto smbhash_err;
97 }
98
99smbhash_err:
100 return rc;
101}
102
103static int
104E_P16(unsigned char *p14, unsigned char *p16)
105{
106 int rc;
107 unsigned char sp8[8] =
108 { 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25 };
109
110 rc = smbhash(p16, sp8, p14);
111 if (rc)
112 return rc;
113 rc = smbhash(p16 + 8, sp8, p14 + 7);
114 return rc;
115}
116
117static int
118E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
119{
120 int rc;
121
122 rc = smbhash(p24, c8, p21);
123 if (rc)
124 return rc;
125 rc = smbhash(p24 + 8, c8, p21 + 7);
126 if (rc)
127 return rc;
128 rc = smbhash(p24 + 16, c8, p21 + 14);
129 return rc;
130}
131
50/* produce a md4 message digest from data of length n bytes */ 132/* produce a md4 message digest from data of length n bytes */
51int 133int
52mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) 134mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
@@ -87,40 +169,30 @@ mdfour_err:
87 return rc; 169 return rc;
88} 170}
89 171
90/* Does the des encryption from the NT or LM MD4 hash. */
91static void
92SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
93 unsigned char p24[24])
94{
95 unsigned char p21[21];
96
97 memset(p21, '\0', 21);
98
99 memcpy(p21, passwd, 16);
100 E_P24(p21, c8, p24);
101}
102
103/* 172/*
104 This implements the X/Open SMB password encryption 173 This implements the X/Open SMB password encryption
105 It takes a password, a 8 byte "crypt key" and puts 24 bytes of 174 It takes a password, a 8 byte "crypt key" and puts 24 bytes of
106 encrypted password into p24 */ 175 encrypted password into p24 */
107/* Note that password must be uppercased and null terminated */ 176/* Note that password must be uppercased and null terminated */
108void 177int
109SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24) 178SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24)
110{ 179{
111 unsigned char p14[15], p21[21]; 180 int rc;
181 unsigned char p14[14], p16[16], p21[21];
112 182
113 memset(p21, '\0', 21);
114 memset(p14, '\0', 14); 183 memset(p14, '\0', 14);
115 strncpy((char *) p14, (char *) passwd, 14); 184 memset(p16, '\0', 16);
185 memset(p21, '\0', 21);
116 186
117/* strupper((char *)p14); *//* BB at least uppercase the easy range */ 187 memcpy(p14, passwd, 14);
118 E_P16(p14, p21); 188 rc = E_P16(p14, p16);
189 if (rc)
190 return rc;
119 191
120 SMBOWFencrypt(p21, c8, p24); 192 memcpy(p21, p16, 16);
193 rc = E_P24(p21, c8, p24);
121 194
122 memset(p14, 0, 15); 195 return rc;
123 memset(p21, 0, 21);
124} 196}
125 197
126/* Routines for Windows NT MD4 Hash functions. */ 198/* Routines for Windows NT MD4 Hash functions. */
@@ -279,16 +351,18 @@ int
279SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) 351SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
280{ 352{
281 int rc; 353 int rc;
282 unsigned char p21[21]; 354 unsigned char p16[16], p21[21];
283 355
356 memset(p16, '\0', 16);
284 memset(p21, '\0', 21); 357 memset(p21, '\0', 21);
285 358
286 rc = E_md4hash(passwd, p21); 359 rc = E_md4hash(passwd, p16);
287 if (rc) { 360 if (rc) {
288 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); 361 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
289 return rc; 362 return rc;
290 } 363 }
291 SMBOWFencrypt(p21, c8, p24); 364 memcpy(p21, p16, 16);
365 rc = E_P24(p21, c8, p24);
292 return rc; 366 return rc;
293} 367}
294 368
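The smbencrypt.c rewrite above threads an int return code through smbhash(), E_P16() and E_P24(), so a crypto API failure propagates to SMBencrypt()/SMBNTencrypt() instead of being silently ignored. One detail worth flagging: as committed, smbhash() appears to release the DES transform only in its error branch. A minimal user-space sketch of the intended control flow, with stub calls (not real kernel crypto APIs) and the transform released on both paths:

#include <stdio.h>

static int stub_alloc(void **tfm) { *tfm = (void *)1; return 0; }
static int stub_encrypt(void *tfm) { (void)tfm; return 0; }
static void stub_free(void *tfm) { (void)tfm; }

static int smbhash_sketch(void)
{
	void *tfm;
	int rc;

	rc = stub_alloc(&tfm);
	if (rc) {
		fprintf(stderr, "could not allocate des crypto\n");
		return rc;		/* nothing to free yet */
	}

	rc = stub_encrypt(tfm);
	if (rc)
		fprintf(stderr, "encrypt failed rc: %d\n", rc);

	stub_free(tfm);			/* released on success and failure */
	return rc;
}

int main(void)
{
	return smbhash_sketch();
}
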
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 46d8756f2b24..147aa22c3c3a 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -129,7 +129,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
129 unsigned int len = iov[0].iov_len; 129 unsigned int len = iov[0].iov_len;
130 unsigned int total_len; 130 unsigned int total_len;
131 int first_vec = 0; 131 int first_vec = 0;
132 unsigned int smb_buf_length = smb_buffer->smb_buf_length; 132 unsigned int smb_buf_length = be32_to_cpu(smb_buffer->smb_buf_length);
133 struct socket *ssocket = server->ssocket; 133 struct socket *ssocket = server->ssocket;
134 134
135 if (ssocket == NULL) 135 if (ssocket == NULL)
@@ -144,17 +144,10 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
144 else 144 else
145 smb_msg.msg_flags = MSG_NOSIGNAL; 145 smb_msg.msg_flags = MSG_NOSIGNAL;
146 146
147 /* smb header is converted in header_assemble. bcc and rest of SMB word
148 area, and byte area if necessary, is converted to littleendian in
149 cifssmb.c and RFC1001 len is converted to bigendian in smb_send
150 Flags2 is converted in SendReceive */
151
152
153 total_len = 0; 147 total_len = 0;
154 for (i = 0; i < n_vec; i++) 148 for (i = 0; i < n_vec; i++)
155 total_len += iov[i].iov_len; 149 total_len += iov[i].iov_len;
156 150
157 smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
158 cFYI(1, "Sending smb: total_len %d", total_len); 151 cFYI(1, "Sending smb: total_len %d", total_len);
159 dump_smb(smb_buffer, len); 152 dump_smb(smb_buffer, len);
160 153
@@ -243,7 +236,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
243 236
244 /* Don't want to modify the buffer as a 237 /* Don't want to modify the buffer as a
245 side effect of this call. */ 238 side effect of this call. */
246 smb_buffer->smb_buf_length = smb_buf_length; 239 smb_buffer->smb_buf_length = cpu_to_be32(smb_buf_length);
247 240
248 return rc; 241 return rc;
249} 242}
@@ -302,7 +295,7 @@ static int wait_for_free_request(struct TCP_Server_Info *server,
302 return 0; 295 return 0;
303} 296}
304 297
305static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf, 298static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
306 struct mid_q_entry **ppmidQ) 299 struct mid_q_entry **ppmidQ)
307{ 300{
308 if (ses->server->tcpStatus == CifsExiting) { 301 if (ses->server->tcpStatus == CifsExiting) {
@@ -349,22 +342,24 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
349 * the result. Caller is responsible for dealing with timeouts. 342 * the result. Caller is responsible for dealing with timeouts.
350 */ 343 */
351int 344int
352cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf, 345cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
353 mid_callback_t *callback, void *cbdata) 346 unsigned int nvec, mid_callback_t *callback, void *cbdata,
347 bool ignore_pend)
354{ 348{
355 int rc; 349 int rc;
356 struct mid_q_entry *mid; 350 struct mid_q_entry *mid;
351 struct smb_hdr *hdr = (struct smb_hdr *)iov[0].iov_base;
357 352
358 rc = wait_for_free_request(server, CIFS_ASYNC_OP); 353 rc = wait_for_free_request(server, ignore_pend ? CIFS_ASYNC_OP : 0);
359 if (rc) 354 if (rc)
360 return rc; 355 return rc;
361 356
362 /* enable signing if server requires it */ 357 /* enable signing if server requires it */
363 if (server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 358 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
364 in_buf->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; 359 hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
365 360
366 mutex_lock(&server->srv_mutex); 361 mutex_lock(&server->srv_mutex);
367 mid = AllocMidQEntry(in_buf, server); 362 mid = AllocMidQEntry(hdr, server);
368 if (mid == NULL) { 363 if (mid == NULL) {
369 mutex_unlock(&server->srv_mutex); 364 mutex_unlock(&server->srv_mutex);
370 return -ENOMEM; 365 return -ENOMEM;
@@ -375,7 +370,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
375 list_add_tail(&mid->qhead, &server->pending_mid_q); 370 list_add_tail(&mid->qhead, &server->pending_mid_q);
376 spin_unlock(&GlobalMid_Lock); 371 spin_unlock(&GlobalMid_Lock);
377 372
378 rc = cifs_sign_smb(in_buf, server, &mid->sequence_number); 373 rc = cifs_sign_smb2(iov, nvec, server, &mid->sequence_number);
379 if (rc) { 374 if (rc) {
380 mutex_unlock(&server->srv_mutex); 375 mutex_unlock(&server->srv_mutex);
381 goto out_err; 376 goto out_err;
@@ -387,7 +382,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
387#ifdef CONFIG_CIFS_STATS2 382#ifdef CONFIG_CIFS_STATS2
388 atomic_inc(&server->inSend); 383 atomic_inc(&server->inSend);
389#endif 384#endif
390 rc = smb_send(server, in_buf, in_buf->smb_buf_length); 385 rc = smb_sendv(server, iov, nvec);
391#ifdef CONFIG_CIFS_STATS2 386#ifdef CONFIG_CIFS_STATS2
392 atomic_dec(&server->inSend); 387 atomic_dec(&server->inSend);
393 mid->when_sent = jiffies; 388 mid->when_sent = jiffies;
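cifs_call_async() now takes a kvec array plus count instead of a single smb_hdr, signing with cifs_sign_smb2() and sending with smb_sendv(), so header and payload never need copying into one contiguous buffer. A user-space sketch of assembling such a two-element vector; struct kvec is redefined here as a stand-in so the snippet compiles outside the kernel:

#include <stddef.h>
#include <string.h>

struct kvec { void *iov_base; size_t iov_len; };	/* stand-in */

static void build_iov(struct kvec *iov, void *hdr, size_t hdr_len,
		      void *data, size_t data_len)
{
	iov[0].iov_base = hdr;  iov[0].iov_len = hdr_len;	/* SMB header */
	iov[1].iov_base = data; iov[1].iov_len = data_len;	/* payload */
}

int main(void)
{
	char hdr[32], payload[64];
	struct kvec iov[2];

	memset(hdr, 0, sizeof(hdr));
	memset(payload, 0, sizeof(payload));
	build_iov(iov, hdr, sizeof(hdr), payload, sizeof(payload));
	return iov[0].iov_len + iov[1].iov_len == 96 ? 0 : 1;
}
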
@@ -414,7 +409,7 @@ out_err:
414 * 409 *
415 */ 410 */
416int 411int
417SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses, 412SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
418 struct smb_hdr *in_buf, int flags) 413 struct smb_hdr *in_buf, int flags)
419{ 414{
420 int rc; 415 int rc;
@@ -422,7 +417,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
422 int resp_buf_type; 417 int resp_buf_type;
423 418
424 iov[0].iov_base = (char *)in_buf; 419 iov[0].iov_base = (char *)in_buf;
425 iov[0].iov_len = in_buf->smb_buf_length + 4; 420 iov[0].iov_len = be32_to_cpu(in_buf->smb_buf_length) + 4;
426 flags |= CIFS_NO_RESP; 421 flags |= CIFS_NO_RESP;
427 rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags); 422 rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
428 cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc); 423 cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc);
@@ -431,7 +426,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
431} 426}
432 427
433static int 428static int
434sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) 429cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
435{ 430{
436 int rc = 0; 431 int rc = 0;
437 432
@@ -439,28 +434,21 @@ sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
439 mid->mid, mid->midState); 434 mid->mid, mid->midState);
440 435
441 spin_lock(&GlobalMid_Lock); 436 spin_lock(&GlobalMid_Lock);
442 /* ensure that it's no longer on the pending_mid_q */
443 list_del_init(&mid->qhead);
444
445 switch (mid->midState) { 437 switch (mid->midState) {
446 case MID_RESPONSE_RECEIVED: 438 case MID_RESPONSE_RECEIVED:
447 spin_unlock(&GlobalMid_Lock); 439 spin_unlock(&GlobalMid_Lock);
448 return rc; 440 return rc;
449 case MID_REQUEST_SUBMITTED:
450 /* socket is going down, reject all calls */
451 if (server->tcpStatus == CifsExiting) {
452 cERROR(1, "%s: canceling mid=%d cmd=0x%x state=%d",
453 __func__, mid->mid, mid->command, mid->midState);
454 rc = -EHOSTDOWN;
455 break;
456 }
457 case MID_RETRY_NEEDED: 441 case MID_RETRY_NEEDED:
458 rc = -EAGAIN; 442 rc = -EAGAIN;
459 break; 443 break;
460 case MID_RESPONSE_MALFORMED: 444 case MID_RESPONSE_MALFORMED:
461 rc = -EIO; 445 rc = -EIO;
462 break; 446 break;
447 case MID_SHUTDOWN:
448 rc = -EHOSTDOWN;
449 break;
463 default: 450 default:
451 list_del_init(&mid->qhead);
464 cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__, 452 cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
465 mid->mid, mid->midState); 453 mid->mid, mid->midState);
466 rc = -EIO; 454 rc = -EIO;
@@ -488,10 +476,10 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
488 int rc = 0; 476 int rc = 0;
489 477
490 /* -4 for RFC1001 length and +2 for BCC field */ 478 /* -4 for RFC1001 length and +2 for BCC field */
491 in_buf->smb_buf_length = sizeof(struct smb_hdr) - 4 + 2; 479 in_buf->smb_buf_length = cpu_to_be32(sizeof(struct smb_hdr) - 4 + 2);
492 in_buf->Command = SMB_COM_NT_CANCEL; 480 in_buf->Command = SMB_COM_NT_CANCEL;
493 in_buf->WordCount = 0; 481 in_buf->WordCount = 0;
494 put_bcc_le(0, in_buf); 482 put_bcc(0, in_buf);
495 483
496 mutex_lock(&server->srv_mutex); 484 mutex_lock(&server->srv_mutex);
497 rc = cifs_sign_smb(in_buf, server, &mid->sequence_number); 485 rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
@@ -499,7 +487,7 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
499 mutex_unlock(&server->srv_mutex); 487 mutex_unlock(&server->srv_mutex);
500 return rc; 488 return rc;
501 } 489 }
502 rc = smb_send(server, in_buf, in_buf->smb_buf_length); 490 rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
503 mutex_unlock(&server->srv_mutex); 491 mutex_unlock(&server->srv_mutex);
504 492
505 cFYI(1, "issued NT_CANCEL for mid %u, rc = %d", 493 cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
@@ -509,13 +497,31 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
509} 497}
510 498
511int 499int
512SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, 500cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
501 bool log_error)
502{
503 dump_smb(mid->resp_buf,
504 min_t(u32, 92, be32_to_cpu(mid->resp_buf->smb_buf_length)));
505
 506	/* verify the response signature if the server enabled or required signing */
507 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
508 /* FIXME: add code to kill session */
509 if (cifs_verify_signature(mid->resp_buf, server,
510 mid->sequence_number + 1) != 0)
511 cERROR(1, "Unexpected SMB signature");
512 }
513
514 /* BB special case reconnect tid and uid here? */
515 return map_smb_to_linux_error(mid->resp_buf, log_error);
516}
517
518int
519SendReceive2(const unsigned int xid, struct cifs_ses *ses,
513 struct kvec *iov, int n_vec, int *pRespBufType /* ret */, 520 struct kvec *iov, int n_vec, int *pRespBufType /* ret */,
514 const int flags) 521 const int flags)
515{ 522{
516 int rc = 0; 523 int rc = 0;
517 int long_op; 524 int long_op;
518 unsigned int receive_len;
519 struct mid_q_entry *midQ; 525 struct mid_q_entry *midQ;
520 struct smb_hdr *in_buf = iov[0].iov_base; 526 struct smb_hdr *in_buf = iov[0].iov_base;
521 527
@@ -605,66 +611,31 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
605 611
606 cifs_small_buf_release(in_buf); 612 cifs_small_buf_release(in_buf);
607 613
608 rc = sync_mid_result(midQ, ses->server); 614 rc = cifs_sync_mid_result(midQ, ses->server);
609 if (rc != 0) { 615 if (rc != 0) {
610 atomic_dec(&ses->server->inFlight); 616 atomic_dec(&ses->server->inFlight);
611 wake_up(&ses->server->request_q); 617 wake_up(&ses->server->request_q);
612 return rc; 618 return rc;
613 } 619 }
614 620
615 receive_len = midQ->resp_buf->smb_buf_length; 621 if (!midQ->resp_buf || midQ->midState != MID_RESPONSE_RECEIVED) {
616
617 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
618 cERROR(1, "Frame too large received. Length: %d Xid: %d",
619 receive_len, xid);
620 rc = -EIO; 622 rc = -EIO;
623 cFYI(1, "Bad MID state?");
621 goto out; 624 goto out;
622 } 625 }
623 626
624 /* rcvd frame is ok */ 627 iov[0].iov_base = (char *)midQ->resp_buf;
625 628 iov[0].iov_len = be32_to_cpu(midQ->resp_buf->smb_buf_length) + 4;
626 if (midQ->resp_buf && 629 if (midQ->largeBuf)
627 (midQ->midState == MID_RESPONSE_RECEIVED)) { 630 *pRespBufType = CIFS_LARGE_BUFFER;
628 631 else
629 iov[0].iov_base = (char *)midQ->resp_buf; 632 *pRespBufType = CIFS_SMALL_BUFFER;
630 if (midQ->largeBuf)
631 *pRespBufType = CIFS_LARGE_BUFFER;
632 else
633 *pRespBufType = CIFS_SMALL_BUFFER;
634 iov[0].iov_len = receive_len + 4;
635
636 dump_smb(midQ->resp_buf, 80);
637 /* convert the length into a more usable form */
638 if ((receive_len > 24) &&
639 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
640 SECMODE_SIGN_ENABLED))) {
641 rc = cifs_verify_signature(midQ->resp_buf,
642 ses->server,
643 midQ->sequence_number+1);
644 if (rc) {
645 cERROR(1, "Unexpected SMB signature");
646 /* BB FIXME add code to kill session */
647 }
648 }
649 633
650 /* BB special case reconnect tid and uid here? */ 634 rc = cifs_check_receive(midQ, ses->server, flags & CIFS_LOG_ERROR);
651 rc = map_smb_to_linux_error(midQ->resp_buf,
652 flags & CIFS_LOG_ERROR);
653
654 /* convert ByteCount if necessary */
655 if (receive_len >= sizeof(struct smb_hdr) - 4
656 /* do not count RFC1001 header */ +
657 (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ )
658 put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
659 if ((flags & CIFS_NO_RESP) == 0)
660 midQ->resp_buf = NULL; /* mark it so buf will
661 not be freed by
662 delete_mid */
663 } else {
664 rc = -EIO;
665 cFYI(1, "Bad MID state?");
666 }
667 635
636 /* mark it so buf will not be freed by delete_mid */
637 if ((flags & CIFS_NO_RESP) == 0)
638 midQ->resp_buf = NULL;
668out: 639out:
669 delete_mid(midQ); 640 delete_mid(midQ);
670 atomic_dec(&ses->server->inFlight); 641 atomic_dec(&ses->server->inFlight);
@@ -674,12 +645,11 @@ out:
674} 645}
675 646
676int 647int
677SendReceive(const unsigned int xid, struct cifsSesInfo *ses, 648SendReceive(const unsigned int xid, struct cifs_ses *ses,
678 struct smb_hdr *in_buf, struct smb_hdr *out_buf, 649 struct smb_hdr *in_buf, struct smb_hdr *out_buf,
679 int *pbytes_returned, const int long_op) 650 int *pbytes_returned, const int long_op)
680{ 651{
681 int rc = 0; 652 int rc = 0;
682 unsigned int receive_len;
683 struct mid_q_entry *midQ; 653 struct mid_q_entry *midQ;
684 654
685 if (ses == NULL) { 655 if (ses == NULL) {
@@ -698,9 +668,10 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
698 to the same server. We may make this configurable later or 668 to the same server. We may make this configurable later or
699 use ses->maxReq */ 669 use ses->maxReq */
700 670
701 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 671 if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize +
672 MAX_CIFS_HDR_SIZE - 4) {
702 cERROR(1, "Illegal length, greater than maximum frame, %d", 673 cERROR(1, "Illegal length, greater than maximum frame, %d",
703 in_buf->smb_buf_length); 674 be32_to_cpu(in_buf->smb_buf_length));
704 return -EIO; 675 return -EIO;
705 } 676 }
706 677
@@ -733,7 +704,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
733#ifdef CONFIG_CIFS_STATS2 704#ifdef CONFIG_CIFS_STATS2
734 atomic_inc(&ses->server->inSend); 705 atomic_inc(&ses->server->inSend);
735#endif 706#endif
736 rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length); 707 rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
737#ifdef CONFIG_CIFS_STATS2 708#ifdef CONFIG_CIFS_STATS2
738 atomic_dec(&ses->server->inSend); 709 atomic_dec(&ses->server->inSend);
739 midQ->when_sent = jiffies; 710 midQ->when_sent = jiffies;
@@ -761,60 +732,23 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
761 spin_unlock(&GlobalMid_Lock); 732 spin_unlock(&GlobalMid_Lock);
762 } 733 }
763 734
764 rc = sync_mid_result(midQ, ses->server); 735 rc = cifs_sync_mid_result(midQ, ses->server);
765 if (rc != 0) { 736 if (rc != 0) {
766 atomic_dec(&ses->server->inFlight); 737 atomic_dec(&ses->server->inFlight);
767 wake_up(&ses->server->request_q); 738 wake_up(&ses->server->request_q);
768 return rc; 739 return rc;
769 } 740 }
770 741
771 receive_len = midQ->resp_buf->smb_buf_length; 742 if (!midQ->resp_buf || !out_buf ||
772 743 midQ->midState != MID_RESPONSE_RECEIVED) {
773 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
774 cERROR(1, "Frame too large received. Length: %d Xid: %d",
775 receive_len, xid);
776 rc = -EIO;
777 goto out;
778 }
779
780 /* rcvd frame is ok */
781
782 if (midQ->resp_buf && out_buf
783 && (midQ->midState == MID_RESPONSE_RECEIVED)) {
784 out_buf->smb_buf_length = receive_len;
785 memcpy((char *)out_buf + 4,
786 (char *)midQ->resp_buf + 4,
787 receive_len);
788
789 dump_smb(out_buf, 92);
790 /* convert the length into a more usable form */
791 if ((receive_len > 24) &&
792 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
793 SECMODE_SIGN_ENABLED))) {
794 rc = cifs_verify_signature(out_buf,
795 ses->server,
796 midQ->sequence_number+1);
797 if (rc) {
798 cERROR(1, "Unexpected SMB signature");
799 /* BB FIXME add code to kill session */
800 }
801 }
802
803 *pbytes_returned = out_buf->smb_buf_length;
804
805 /* BB special case reconnect tid and uid here? */
806 rc = map_smb_to_linux_error(out_buf, 0 /* no log */ );
807
808 /* convert ByteCount if necessary */
809 if (receive_len >= sizeof(struct smb_hdr) - 4
810 /* do not count RFC1001 header */ +
811 (2 * out_buf->WordCount) + 2 /* bcc */ )
812 put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
813 } else {
814 rc = -EIO; 744 rc = -EIO;
815 cERROR(1, "Bad MID state?"); 745 cERROR(1, "Bad MID state?");
746 goto out;
816 } 747 }
817 748
749 *pbytes_returned = be32_to_cpu(midQ->resp_buf->smb_buf_length);
750 memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
751 rc = cifs_check_receive(midQ, ses->server, 0);
818out: 752out:
819 delete_mid(midQ); 753 delete_mid(midQ);
820 atomic_dec(&ses->server->inFlight); 754 atomic_dec(&ses->server->inFlight);
@@ -827,12 +761,12 @@ out:
827 blocking lock to return. */ 761 blocking lock to return. */
828 762
829static int 763static int
830send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon, 764send_lock_cancel(const unsigned int xid, struct cifs_tcon *tcon,
831 struct smb_hdr *in_buf, 765 struct smb_hdr *in_buf,
832 struct smb_hdr *out_buf) 766 struct smb_hdr *out_buf)
833{ 767{
834 int bytes_returned; 768 int bytes_returned;
835 struct cifsSesInfo *ses = tcon->ses; 769 struct cifs_ses *ses = tcon->ses;
836 LOCK_REQ *pSMB = (LOCK_REQ *)in_buf; 770 LOCK_REQ *pSMB = (LOCK_REQ *)in_buf;
837 771
838 /* We just modify the current in_buf to change 772 /* We just modify the current in_buf to change
@@ -849,15 +783,14 @@ send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon,
849} 783}
850 784
851int 785int
852SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, 786SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
853 struct smb_hdr *in_buf, struct smb_hdr *out_buf, 787 struct smb_hdr *in_buf, struct smb_hdr *out_buf,
854 int *pbytes_returned) 788 int *pbytes_returned)
855{ 789{
856 int rc = 0; 790 int rc = 0;
857 int rstart = 0; 791 int rstart = 0;
858 unsigned int receive_len;
859 struct mid_q_entry *midQ; 792 struct mid_q_entry *midQ;
860 struct cifsSesInfo *ses; 793 struct cifs_ses *ses;
861 794
862 if (tcon == NULL || tcon->ses == NULL) { 795 if (tcon == NULL || tcon->ses == NULL) {
863 cERROR(1, "Null smb session"); 796 cERROR(1, "Null smb session");
@@ -877,9 +810,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
877 to the same server. We may make this configurable later or 810 to the same server. We may make this configurable later or
878 use ses->maxReq */ 811 use ses->maxReq */
879 812
880 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 813 if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize +
814 MAX_CIFS_HDR_SIZE - 4) {
881 cERROR(1, "Illegal length, greater than maximum frame, %d", 815 cERROR(1, "Illegal length, greater than maximum frame, %d",
882 in_buf->smb_buf_length); 816 be32_to_cpu(in_buf->smb_buf_length));
883 return -EIO; 817 return -EIO;
884 } 818 }
885 819
@@ -910,7 +844,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
910#ifdef CONFIG_CIFS_STATS2 844#ifdef CONFIG_CIFS_STATS2
911 atomic_inc(&ses->server->inSend); 845 atomic_inc(&ses->server->inSend);
912#endif 846#endif
913 rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length); 847 rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
914#ifdef CONFIG_CIFS_STATS2 848#ifdef CONFIG_CIFS_STATS2
915 atomic_dec(&ses->server->inSend); 849 atomic_dec(&ses->server->inSend);
916 midQ->when_sent = jiffies; 850 midQ->when_sent = jiffies;
@@ -973,56 +907,20 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
973 rstart = 1; 907 rstart = 1;
974 } 908 }
975 909
976 rc = sync_mid_result(midQ, ses->server); 910 rc = cifs_sync_mid_result(midQ, ses->server);
977 if (rc != 0) 911 if (rc != 0)
978 return rc; 912 return rc;
979 913
980 receive_len = midQ->resp_buf->smb_buf_length;
981 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
982 cERROR(1, "Frame too large received. Length: %d Xid: %d",
983 receive_len, xid);
984 rc = -EIO;
985 goto out;
986 }
987
988 /* rcvd frame is ok */ 914 /* rcvd frame is ok */
989 915 if (out_buf == NULL || midQ->midState != MID_RESPONSE_RECEIVED) {
990 if ((out_buf == NULL) || (midQ->midState != MID_RESPONSE_RECEIVED)) {
991 rc = -EIO; 916 rc = -EIO;
992 cERROR(1, "Bad MID state?"); 917 cERROR(1, "Bad MID state?");
993 goto out; 918 goto out;
994 } 919 }
995 920
996 out_buf->smb_buf_length = receive_len; 921 *pbytes_returned = be32_to_cpu(midQ->resp_buf->smb_buf_length);
997 memcpy((char *)out_buf + 4, 922 memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
998 (char *)midQ->resp_buf + 4, 923 rc = cifs_check_receive(midQ, ses->server, 0);
999 receive_len);
1000
1001 dump_smb(out_buf, 92);
1002 /* convert the length into a more usable form */
1003 if ((receive_len > 24) &&
1004 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
1005 SECMODE_SIGN_ENABLED))) {
1006 rc = cifs_verify_signature(out_buf,
1007 ses->server,
1008 midQ->sequence_number+1);
1009 if (rc) {
1010 cERROR(1, "Unexpected SMB signature");
1011 /* BB FIXME add code to kill session */
1012 }
1013 }
1014
1015 *pbytes_returned = out_buf->smb_buf_length;
1016
1017 /* BB special case reconnect tid and uid here? */
1018 rc = map_smb_to_linux_error(out_buf, 0 /* no log */ );
1019
1020 /* convert ByteCount if necessary */
1021 if (receive_len >= sizeof(struct smb_hdr) - 4
1022 /* do not count RFC1001 header */ +
1023 (2 * out_buf->WordCount) + 2 /* bcc */ )
1024 put_bcc(get_bcc_le(out_buf), out_buf);
1025
1026out: 924out:
1027 delete_mid(midQ); 925 delete_mid(midQ);
1028 if (rstart && rc == -EACCES) 926 if (rstart && rc == -EACCES)
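
Editor's note: the transport.c hunks above do two things at once: they consolidate the post-receive checks into cifs_sync_mid_result()/cifs_check_receive(), and they switch smb_buf_length to be stored big-endian in the header, converting at every use with be32_to_cpu()/cpu_to_be32(). A minimal userspace sketch of that endianness convention, with htonl()/ntohl() standing in for the kernel helpers; the struct and length value are illustrative, not the real smb_hdr layout:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>	/* htonl()/ntohl() stand in for cpu_to_be32()/be32_to_cpu() */

struct hdr_sketch {
	uint32_t smb_buf_length;	/* kept big-endian in memory, as after this patch */
};

int main(void)
{
	struct hdr_sketch hdr;
	uint32_t len = 84;	/* illustrative frame length, not a real header size */

	hdr.smb_buf_length = htonl(len);	/* cpu_to_be32() on store */
	/* be32_to_cpu() at each use; +4 covers the RFC1001 length field */
	printf("send %u bytes\n", ntohl(hdr.smb_buf_length) + 4);
	return 0;
}

Keeping the field big-endian means the value is wire-ready, at the cost of a conversion at every native-order use, which is exactly the pattern of edits seen throughout the hunks above.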
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index eae2a1491608..2a22fb2989e4 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -49,7 +49,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
49 int xid; 49 int xid;
50 struct cifs_sb_info *cifs_sb; 50 struct cifs_sb_info *cifs_sb;
51 struct tcon_link *tlink; 51 struct tcon_link *tlink;
52 struct cifsTconInfo *pTcon; 52 struct cifs_tcon *pTcon;
53 struct super_block *sb; 53 struct super_block *sb;
54 char *full_path = NULL; 54 char *full_path = NULL;
55 55
@@ -109,9 +109,10 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
109 int xid; 109 int xid;
110 struct cifs_sb_info *cifs_sb; 110 struct cifs_sb_info *cifs_sb;
111 struct tcon_link *tlink; 111 struct tcon_link *tlink;
112 struct cifsTconInfo *pTcon; 112 struct cifs_tcon *pTcon;
113 struct super_block *sb; 113 struct super_block *sb;
114 char *full_path; 114 char *full_path;
115 struct cifs_ntsd *pacl;
115 116
116 if (direntry == NULL) 117 if (direntry == NULL)
117 return -EIO; 118 return -EIO;
@@ -166,6 +167,25 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
166 rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, 167 rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
167 (__u16)value_size, cifs_sb->local_nls, 168 (__u16)value_size, cifs_sb->local_nls,
168 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 169 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
170 } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
171 strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
172 pacl = kmalloc(value_size, GFP_KERNEL);
173 if (!pacl) {
174 cFYI(1, "%s: Can't allocate memory for ACL",
175 __func__);
176 rc = -ENOMEM;
177 } else {
178#ifdef CONFIG_CIFS_ACL
179 memcpy(pacl, ea_value, value_size);
180 rc = set_cifs_acl(pacl, value_size,
181 direntry->d_inode, full_path);
182 if (rc == 0) /* force revalidate of the inode */
183 CIFS_I(direntry->d_inode)->time = 0;
184 kfree(pacl);
185#else
186 cFYI(1, "Set CIFS ACL not supported yet");
187#endif /* CONFIG_CIFS_ACL */
188 }
169 } else { 189 } else {
170 int temp; 190 int temp;
171 temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, 191 temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
@@ -220,7 +240,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
220 int xid; 240 int xid;
221 struct cifs_sb_info *cifs_sb; 241 struct cifs_sb_info *cifs_sb;
222 struct tcon_link *tlink; 242 struct tcon_link *tlink;
223 struct cifsTconInfo *pTcon; 243 struct cifs_tcon *pTcon;
224 struct super_block *sb; 244 struct super_block *sb;
225 char *full_path; 245 char *full_path;
226 246
@@ -352,7 +372,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
352 int xid; 372 int xid;
353 struct cifs_sb_info *cifs_sb; 373 struct cifs_sb_info *cifs_sb;
354 struct tcon_link *tlink; 374 struct tcon_link *tlink;
355 struct cifsTconInfo *pTcon; 375 struct cifs_tcon *pTcon;
356 struct super_block *sb; 376 struct super_block *sb;
357 char *full_path; 377 char *full_path;
358 378
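
Editor's note: the new cifs_setxattr() branch above copies a caller-supplied raw security descriptor and hands it to set_cifs_acl() when CONFIG_CIFS_ACL is enabled. A hedged userspace sketch of how a caller might exercise it, assuming the conventional "system.cifs_acl" name for CIFS_XATTR_CIFS_ACL; the mount path and descriptor blob are hypothetical, and a real call needs a valid self-relative security descriptor:

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	/* sd_blob would be a real self-relative security descriptor */
	char sd_blob[64];
	memset(sd_blob, 0, sizeof(sd_blob));

	if (setxattr("/mnt/cifs/file", "system.cifs_acl",
		     sd_blob, sizeof(sd_blob), 0) != 0)
		perror("setxattr");	/* expected to fail without CONFIG_CIFS_ACL */
	return 0;
}

On success the patch also zeroes the cached inode timestamp (CIFS_I(...)->time = 0) to force revalidation, so the new ACL's effect on mode bits is picked up on the next lookup.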
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 2b8dae4d121e..a46126fd5735 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -336,6 +336,8 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
336 int len = de->d_name.len; 336 int len = de->d_name.len;
337 int error; 337 int error;
338 338
339 dentry_unhash(de);
340
339 error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); 341 error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
340 if (!error) { 342 if (!error) {
341 /* VFS may delete the child */ 343 /* VFS may delete the child */
@@ -359,6 +361,9 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
359 int new_length = new_dentry->d_name.len; 361 int new_length = new_dentry->d_name.len;
360 int error; 362 int error;
361 363
364 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
365 dentry_unhash(new_dentry);
366
362 error = venus_rename(old_dir->i_sb, coda_i2f(old_dir), 367 error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
363 coda_i2f(new_dir), old_length, new_length, 368 coda_i2f(new_dir), old_length, new_length,
364 (const char *) old_name, (const char *)new_name); 369 (const char *) old_name, (const char *)new_name);
diff --git a/fs/compat.c b/fs/compat.c
index 72fe6cda9108..0ea00832de23 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1306,241 +1306,6 @@ compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int
1306 return do_sys_open(dfd, filename, flags, mode); 1306 return do_sys_open(dfd, filename, flags, mode);
1307} 1307}
1308 1308
1309/*
1310 * compat_count() counts the number of arguments/envelopes. It is basically
1311 * a copy of count() from fs/exec.c, except that it works with 32 bit argv
1312 * and envp pointers.
1313 */
1314static int compat_count(compat_uptr_t __user *argv, int max)
1315{
1316 int i = 0;
1317
1318 if (argv != NULL) {
1319 for (;;) {
1320 compat_uptr_t p;
1321
1322 if (get_user(p, argv))
1323 return -EFAULT;
1324 if (!p)
1325 break;
1326 argv++;
1327 if (i++ >= max)
1328 return -E2BIG;
1329
1330 if (fatal_signal_pending(current))
1331 return -ERESTARTNOHAND;
1332 cond_resched();
1333 }
1334 }
1335 return i;
1336}
1337
1338/*
1339 * compat_copy_strings() is basically a copy of copy_strings() from fs/exec.c
1340 * except that it works with 32 bit argv and envp pointers.
1341 */
1342static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
1343 struct linux_binprm *bprm)
1344{
1345 struct page *kmapped_page = NULL;
1346 char *kaddr = NULL;
1347 unsigned long kpos = 0;
1348 int ret;
1349
1350 while (argc-- > 0) {
1351 compat_uptr_t str;
1352 int len;
1353 unsigned long pos;
1354
1355 if (get_user(str, argv+argc) ||
1356 !(len = strnlen_user(compat_ptr(str), MAX_ARG_STRLEN))) {
1357 ret = -EFAULT;
1358 goto out;
1359 }
1360
1361 if (len > MAX_ARG_STRLEN) {
1362 ret = -E2BIG;
1363 goto out;
1364 }
1365
1366 /* We're going to work our way backwards. */
1367 pos = bprm->p;
1368 str += len;
1369 bprm->p -= len;
1370
1371 while (len > 0) {
1372 int offset, bytes_to_copy;
1373
1374 if (fatal_signal_pending(current)) {
1375 ret = -ERESTARTNOHAND;
1376 goto out;
1377 }
1378 cond_resched();
1379
1380 offset = pos % PAGE_SIZE;
1381 if (offset == 0)
1382 offset = PAGE_SIZE;
1383
1384 bytes_to_copy = offset;
1385 if (bytes_to_copy > len)
1386 bytes_to_copy = len;
1387
1388 offset -= bytes_to_copy;
1389 pos -= bytes_to_copy;
1390 str -= bytes_to_copy;
1391 len -= bytes_to_copy;
1392
1393 if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
1394 struct page *page;
1395
1396 page = get_arg_page(bprm, pos, 1);
1397 if (!page) {
1398 ret = -E2BIG;
1399 goto out;
1400 }
1401
1402 if (kmapped_page) {
1403 flush_kernel_dcache_page(kmapped_page);
1404 kunmap(kmapped_page);
1405 put_page(kmapped_page);
1406 }
1407 kmapped_page = page;
1408 kaddr = kmap(kmapped_page);
1409 kpos = pos & PAGE_MASK;
1410 flush_cache_page(bprm->vma, kpos,
1411 page_to_pfn(kmapped_page));
1412 }
1413 if (copy_from_user(kaddr+offset, compat_ptr(str),
1414 bytes_to_copy)) {
1415 ret = -EFAULT;
1416 goto out;
1417 }
1418 }
1419 }
1420 ret = 0;
1421out:
1422 if (kmapped_page) {
1423 flush_kernel_dcache_page(kmapped_page);
1424 kunmap(kmapped_page);
1425 put_page(kmapped_page);
1426 }
1427 return ret;
1428}
1429
1430/*
1431 * compat_do_execve() is mostly a copy of do_execve(), with the exception
1432 * that it processes 32 bit argv and envp pointers.
1433 */
1434int compat_do_execve(char * filename,
1435 compat_uptr_t __user *argv,
1436 compat_uptr_t __user *envp,
1437 struct pt_regs * regs)
1438{
1439 struct linux_binprm *bprm;
1440 struct file *file;
1441 struct files_struct *displaced;
1442 bool clear_in_exec;
1443 int retval;
1444
1445 retval = unshare_files(&displaced);
1446 if (retval)
1447 goto out_ret;
1448
1449 retval = -ENOMEM;
1450 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1451 if (!bprm)
1452 goto out_files;
1453
1454 retval = prepare_bprm_creds(bprm);
1455 if (retval)
1456 goto out_free;
1457
1458 retval = check_unsafe_exec(bprm);
1459 if (retval < 0)
1460 goto out_free;
1461 clear_in_exec = retval;
1462 current->in_execve = 1;
1463
1464 file = open_exec(filename);
1465 retval = PTR_ERR(file);
1466 if (IS_ERR(file))
1467 goto out_unmark;
1468
1469 sched_exec();
1470
1471 bprm->file = file;
1472 bprm->filename = filename;
1473 bprm->interp = filename;
1474
1475 retval = bprm_mm_init(bprm);
1476 if (retval)
1477 goto out_file;
1478
1479 bprm->argc = compat_count(argv, MAX_ARG_STRINGS);
1480 if ((retval = bprm->argc) < 0)
1481 goto out;
1482
1483 bprm->envc = compat_count(envp, MAX_ARG_STRINGS);
1484 if ((retval = bprm->envc) < 0)
1485 goto out;
1486
1487 retval = prepare_binprm(bprm);
1488 if (retval < 0)
1489 goto out;
1490
1491 retval = copy_strings_kernel(1, &bprm->filename, bprm);
1492 if (retval < 0)
1493 goto out;
1494
1495 bprm->exec = bprm->p;
1496 retval = compat_copy_strings(bprm->envc, envp, bprm);
1497 if (retval < 0)
1498 goto out;
1499
1500 retval = compat_copy_strings(bprm->argc, argv, bprm);
1501 if (retval < 0)
1502 goto out;
1503
1504 retval = search_binary_handler(bprm, regs);
1505 if (retval < 0)
1506 goto out;
1507
1508 /* execve succeeded */
1509 current->fs->in_exec = 0;
1510 current->in_execve = 0;
1511 acct_update_integrals(current);
1512 free_bprm(bprm);
1513 if (displaced)
1514 put_files_struct(displaced);
1515 return retval;
1516
1517out:
1518 if (bprm->mm) {
1519 acct_arg_size(bprm, 0);
1520 mmput(bprm->mm);
1521 }
1522
1523out_file:
1524 if (bprm->file) {
1525 allow_write_access(bprm->file);
1526 fput(bprm->file);
1527 }
1528
1529out_unmark:
1530 if (clear_in_exec)
1531 current->fs->in_exec = 0;
1532 current->in_execve = 0;
1533
1534out_free:
1535 free_bprm(bprm);
1536
1537out_files:
1538 if (displaced)
1539 reset_files_struct(displaced);
1540out_ret:
1541 return retval;
1542}
1543
1544#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) 1309#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t))
1545 1310
1546static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, 1311static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
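
Editor's note: the block removed above duplicated count() and copy_strings() from fs/exec.c for 32-bit userspace pointers, presumably replaced by a shared implementation on the fs/exec.c side of this series. For reference, a userspace sketch of the argv-counting contract compat_count() implemented; the typedef and the -E2BIG stand-in are illustrative:

#include <stdio.h>
#include <stdint.h>

typedef uint32_t compat_uptr_t;	/* 32-bit user pointer, as in the kernel */

static int count_sketch(const compat_uptr_t *argv, int max)
{
	int i = 0;

	if (argv) {
		while (argv[i]) {	/* stop at the NULL terminator */
			if (i++ >= max)
				return -7;	/* stands in for -E2BIG */
		}
	}
	return i;
}

int main(void)
{
	compat_uptr_t argv[] = { 0x1000, 0x2000, 0x3000, 0 };

	printf("argc = %d\n", count_sketch(argv, 32));	/* prints argc = 3 */
	return 0;
}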
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 9a37a9b6de3a..9d17d350abc5 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1359,6 +1359,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
1359 struct module *subsys_owner = NULL, *dead_item_owner = NULL; 1359 struct module *subsys_owner = NULL, *dead_item_owner = NULL;
1360 int ret; 1360 int ret;
1361 1361
1362 dentry_unhash(dentry);
1363
1362 if (dentry->d_parent == configfs_sb->s_root) 1364 if (dentry->d_parent == configfs_sb->s_root)
1363 return -EPERM; 1365 return -EPERM;
1364 1366
diff --git a/fs/dcache.c b/fs/dcache.c
index 22a0ef41bad1..37f72ee5bf7c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -35,6 +35,7 @@
35#include <linux/hardirq.h> 35#include <linux/hardirq.h>
36#include <linux/bit_spinlock.h> 36#include <linux/bit_spinlock.h>
37#include <linux/rculist_bl.h> 37#include <linux/rculist_bl.h>
38#include <linux/prefetch.h>
38#include "internal.h" 39#include "internal.h"
39 40
40/* 41/*
@@ -1219,7 +1220,7 @@ void shrink_dcache_parent(struct dentry * parent)
1219EXPORT_SYMBOL(shrink_dcache_parent); 1220EXPORT_SYMBOL(shrink_dcache_parent);
1220 1221
1221/* 1222/*
1222 * Scan `nr' dentries and return the number which remain. 1223 * Scan `sc->nr_to_scan' dentries and return the number which remain.
1223 * 1224 *
1224 * We need to avoid reentering the filesystem if the caller is performing a 1225 * We need to avoid reentering the filesystem if the caller is performing a
1225 * GFP_NOFS allocation attempt. One example deadlock is: 1226 * GFP_NOFS allocation attempt. One example deadlock is:
@@ -1230,8 +1231,12 @@ EXPORT_SYMBOL(shrink_dcache_parent);
1230 * 1231 *
1231 * In this case we return -1 to tell the caller that we baled. 1232 * In this case we return -1 to tell the caller that we baled.
1232 */ 1233 */
1233static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 1234static int shrink_dcache_memory(struct shrinker *shrink,
1235 struct shrink_control *sc)
1234{ 1236{
1237 int nr = sc->nr_to_scan;
1238 gfp_t gfp_mask = sc->gfp_mask;
1239
1235 if (nr) { 1240 if (nr) {
1236 if (!(gfp_mask & __GFP_FS)) 1241 if (!(gfp_mask & __GFP_FS))
1237 return -1; 1242 return -1;
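
Editor's note: the hunk above changes the shrinker callback from shrink(shrinker, nr, gfp_mask) to shrink(shrinker, sc), packing the request into struct shrink_control (the drop_caches.c hunk later in this diff constructs one explicitly). A compile-only userspace sketch of the new shape; the struct mirrors the patch, while GFP_FS_BIT and the callback body are illustrative stand-ins:

#include <stdio.h>

typedef unsigned int gfp_t;	/* stand-in for the kernel type */
#define GFP_FS_BIT 0x80u	/* stand-in for __GFP_FS */

/* mirrors the parameter block this series threads through shrinkers */
struct shrink_control {
	gfp_t gfp_mask;
	int nr_to_scan;
};

static int shrink_sketch(struct shrink_control *sc)
{
	if (sc->nr_to_scan && !(sc->gfp_mask & GFP_FS_BIT))
		return -1;	/* caller can't re-enter the fs: bail */
	return 0;		/* nothing left to prune in this sketch */
}

int main(void)
{
	struct shrink_control sc = { .gfp_mask = 0, .nr_to_scan = 128 };

	printf("%d\n", shrink_sketch(&sc));	/* prints -1: GFP_NOFS caller */
	return 0;
}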
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 89d394d8fe24..90f76575c056 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -428,26 +428,17 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
428 size_t count, loff_t *ppos) 428 size_t count, loff_t *ppos)
429{ 429{
430 char buf[32]; 430 char buf[32];
431 int buf_size; 431 size_t buf_size;
432 bool bv;
432 u32 *val = file->private_data; 433 u32 *val = file->private_data;
433 434
434 buf_size = min(count, (sizeof(buf)-1)); 435 buf_size = min(count, (sizeof(buf)-1));
435 if (copy_from_user(buf, user_buf, buf_size)) 436 if (copy_from_user(buf, user_buf, buf_size))
436 return -EFAULT; 437 return -EFAULT;
437 438
438 switch (buf[0]) { 439 if (strtobool(buf, &bv) == 0)
439 case 'y': 440 *val = bv;
440 case 'Y': 441
441 case '1':
442 *val = 1;
443 break;
444 case 'n':
445 case 'N':
446 case '0':
447 *val = 0;
448 break;
449 }
450
451 return count; 442 return count;
452} 443}
453 444
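
Editor's note: write_file_bool() now defers parsing to strtobool(); unlike the open-coded switch it replaces, strtobool() also reports failure for unrecognized input, although the caller above still returns count either way. A userspace sketch reimplementing the helper's contract (first character decides, 0 on success, -EINVAL otherwise), not calling the kernel function itself:

#include <stdio.h>
#include <errno.h>
#include <stdbool.h>

static int strtobool_sketch(const char *s, bool *res)
{
	switch (s[0]) {
	case 'y': case 'Y': case '1':
		*res = true;
		return 0;
	case 'n': case 'N': case '0':
		*res = false;
		return 0;
	default:
		return -EINVAL;	/* input left unparsed */
	}
}

int main(void)
{
	bool v;

	printf("%d\n", strtobool_sketch("Y\n", &v) == 0 ? v : -1);	/* 1 */
	printf("%d\n", strtobool_sketch("maybe", &v));			/* -EINVAL */
	return 0;
}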
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 0d329ff8ed4c..9b026ea8baa9 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -100,6 +100,7 @@ struct dlm_cluster {
100 unsigned int cl_log_debug; 100 unsigned int cl_log_debug;
101 unsigned int cl_protocol; 101 unsigned int cl_protocol;
102 unsigned int cl_timewarn_cs; 102 unsigned int cl_timewarn_cs;
103 unsigned int cl_waitwarn_us;
103}; 104};
104 105
105enum { 106enum {
@@ -114,6 +115,7 @@ enum {
114 CLUSTER_ATTR_LOG_DEBUG, 115 CLUSTER_ATTR_LOG_DEBUG,
115 CLUSTER_ATTR_PROTOCOL, 116 CLUSTER_ATTR_PROTOCOL,
116 CLUSTER_ATTR_TIMEWARN_CS, 117 CLUSTER_ATTR_TIMEWARN_CS,
118 CLUSTER_ATTR_WAITWARN_US,
117}; 119};
118 120
119struct cluster_attribute { 121struct cluster_attribute {
@@ -166,6 +168,7 @@ CLUSTER_ATTR(scan_secs, 1);
166CLUSTER_ATTR(log_debug, 0); 168CLUSTER_ATTR(log_debug, 0);
167CLUSTER_ATTR(protocol, 0); 169CLUSTER_ATTR(protocol, 0);
168CLUSTER_ATTR(timewarn_cs, 1); 170CLUSTER_ATTR(timewarn_cs, 1);
171CLUSTER_ATTR(waitwarn_us, 0);
169 172
170static struct configfs_attribute *cluster_attrs[] = { 173static struct configfs_attribute *cluster_attrs[] = {
171 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, 174 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
@@ -179,6 +182,7 @@ static struct configfs_attribute *cluster_attrs[] = {
179 [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr, 182 [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
180 [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr, 183 [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
181 [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, 184 [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
185 [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr,
182 NULL, 186 NULL,
183}; 187};
184 188
@@ -439,6 +443,7 @@ static struct config_group *make_cluster(struct config_group *g,
439 cl->cl_log_debug = dlm_config.ci_log_debug; 443 cl->cl_log_debug = dlm_config.ci_log_debug;
440 cl->cl_protocol = dlm_config.ci_protocol; 444 cl->cl_protocol = dlm_config.ci_protocol;
441 cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; 445 cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
446 cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
442 447
443 space_list = &sps->ss_group; 448 space_list = &sps->ss_group;
444 comm_list = &cms->cs_group; 449 comm_list = &cms->cs_group;
@@ -986,6 +991,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
986#define DEFAULT_LOG_DEBUG 0 991#define DEFAULT_LOG_DEBUG 0
987#define DEFAULT_PROTOCOL 0 992#define DEFAULT_PROTOCOL 0
988#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ 993#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
994#define DEFAULT_WAITWARN_US 0
989 995
990struct dlm_config_info dlm_config = { 996struct dlm_config_info dlm_config = {
991 .ci_tcp_port = DEFAULT_TCP_PORT, 997 .ci_tcp_port = DEFAULT_TCP_PORT,
@@ -998,6 +1004,7 @@ struct dlm_config_info dlm_config = {
998 .ci_scan_secs = DEFAULT_SCAN_SECS, 1004 .ci_scan_secs = DEFAULT_SCAN_SECS,
999 .ci_log_debug = DEFAULT_LOG_DEBUG, 1005 .ci_log_debug = DEFAULT_LOG_DEBUG,
1000 .ci_protocol = DEFAULT_PROTOCOL, 1006 .ci_protocol = DEFAULT_PROTOCOL,
1001 .ci_timewarn_cs = DEFAULT_TIMEWARN_CS 1007 .ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
1008 .ci_waitwarn_us = DEFAULT_WAITWARN_US
1002}; 1009};
1003 1010
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 4f1d6fce58c5..dd0ce24d5a80 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -28,6 +28,7 @@ struct dlm_config_info {
28 int ci_log_debug; 28 int ci_log_debug;
29 int ci_protocol; 29 int ci_protocol;
30 int ci_timewarn_cs; 30 int ci_timewarn_cs;
31 int ci_waitwarn_us;
31}; 32};
32 33
33extern struct dlm_config_info dlm_config; 34extern struct dlm_config_info dlm_config;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index b94204913011..0262451eb9c6 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -209,6 +209,7 @@ struct dlm_args {
209#define DLM_IFL_WATCH_TIMEWARN 0x00400000 209#define DLM_IFL_WATCH_TIMEWARN 0x00400000
210#define DLM_IFL_TIMEOUT_CANCEL 0x00800000 210#define DLM_IFL_TIMEOUT_CANCEL 0x00800000
211#define DLM_IFL_DEADLOCK_CANCEL 0x01000000 211#define DLM_IFL_DEADLOCK_CANCEL 0x01000000
212#define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */
212#define DLM_IFL_USER 0x00000001 213#define DLM_IFL_USER 0x00000001
213#define DLM_IFL_ORPHAN 0x00000002 214#define DLM_IFL_ORPHAN 0x00000002
214 215
@@ -245,6 +246,7 @@ struct dlm_lkb {
245 246
246 int8_t lkb_wait_type; /* type of reply waiting for */ 247 int8_t lkb_wait_type; /* type of reply waiting for */
247 int8_t lkb_wait_count; 248 int8_t lkb_wait_count;
249 int lkb_wait_nodeid; /* for debugging */
248 250
249 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ 251 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
250 struct list_head lkb_statequeue; /* rsb g/c/w list */ 252 struct list_head lkb_statequeue; /* rsb g/c/w list */
@@ -254,6 +256,7 @@ struct dlm_lkb {
254 struct list_head lkb_ownqueue; /* list of locks for a process */ 256 struct list_head lkb_ownqueue; /* list of locks for a process */
255 struct list_head lkb_time_list; 257 struct list_head lkb_time_list;
256 ktime_t lkb_timestamp; 258 ktime_t lkb_timestamp;
259 ktime_t lkb_wait_time;
257 unsigned long lkb_timeout_cs; 260 unsigned long lkb_timeout_cs;
258 261
259 struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; 262 struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE];
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 56d6bfcc1e48..f71d0b5abd95 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -799,10 +799,84 @@ static int msg_reply_type(int mstype)
799 return -1; 799 return -1;
800} 800}
801 801
802static int nodeid_warned(int nodeid, int num_nodes, int *warned)
803{
804 int i;
805
806 for (i = 0; i < num_nodes; i++) {
807 if (!warned[i]) {
808 warned[i] = nodeid;
809 return 0;
810 }
811 if (warned[i] == nodeid)
812 return 1;
813 }
814 return 0;
815}
816
817void dlm_scan_waiters(struct dlm_ls *ls)
818{
819 struct dlm_lkb *lkb;
820 ktime_t zero = ktime_set(0, 0);
821 s64 us;
822 s64 debug_maxus = 0;
823 u32 debug_scanned = 0;
824 u32 debug_expired = 0;
825 int num_nodes = 0;
826 int *warned = NULL;
827
828 if (!dlm_config.ci_waitwarn_us)
829 return;
830
831 mutex_lock(&ls->ls_waiters_mutex);
832
833 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
834 if (ktime_equal(lkb->lkb_wait_time, zero))
835 continue;
836
837 debug_scanned++;
838
839 us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time));
840
841 if (us < dlm_config.ci_waitwarn_us)
842 continue;
843
844 lkb->lkb_wait_time = zero;
845
846 debug_expired++;
847 if (us > debug_maxus)
848 debug_maxus = us;
849
850 if (!num_nodes) {
851 num_nodes = ls->ls_num_nodes;
852 warned = kmalloc(GFP_KERNEL, num_nodes * sizeof(int));
853 if (warned)
854 memset(warned, 0, num_nodes * sizeof(int));
855 }
856 if (!warned)
857 continue;
858 if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned))
859 continue;
860
861 log_error(ls, "waitwarn %x %lld %d us check connection to "
862 "node %d", lkb->lkb_id, (long long)us,
863 dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid);
864 }
865 mutex_unlock(&ls->ls_waiters_mutex);
866
867 if (warned)
868 kfree(warned);
869
870 if (debug_expired)
871 log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us",
872 debug_scanned, debug_expired,
873 dlm_config.ci_waitwarn_us, (long long)debug_maxus);
874}
875
802/* add/remove lkb from global waiters list of lkb's waiting for 876/* add/remove lkb from global waiters list of lkb's waiting for
803 a reply from a remote node */ 877 a reply from a remote node */
804 878
805static int add_to_waiters(struct dlm_lkb *lkb, int mstype) 879static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
806{ 880{
807 struct dlm_ls *ls = lkb->lkb_resource->res_ls; 881 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
808 int error = 0; 882 int error = 0;
@@ -842,6 +916,8 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
842 916
843 lkb->lkb_wait_count++; 917 lkb->lkb_wait_count++;
844 lkb->lkb_wait_type = mstype; 918 lkb->lkb_wait_type = mstype;
919 lkb->lkb_wait_time = ktime_get();
920 lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
845 hold_lkb(lkb); 921 hold_lkb(lkb);
846 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); 922 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
847 out: 923 out:
@@ -961,10 +1037,10 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
961 struct dlm_ls *ls = lkb->lkb_resource->res_ls; 1037 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
962 int error; 1038 int error;
963 1039
964 if (ms != &ls->ls_stub_ms) 1040 if (ms->m_flags != DLM_IFL_STUB_MS)
965 mutex_lock(&ls->ls_waiters_mutex); 1041 mutex_lock(&ls->ls_waiters_mutex);
966 error = _remove_from_waiters(lkb, ms->m_type, ms); 1042 error = _remove_from_waiters(lkb, ms->m_type, ms);
967 if (ms != &ls->ls_stub_ms) 1043 if (ms->m_flags != DLM_IFL_STUB_MS)
968 mutex_unlock(&ls->ls_waiters_mutex); 1044 mutex_unlock(&ls->ls_waiters_mutex);
969 return error; 1045 return error;
970} 1046}
@@ -1157,6 +1233,16 @@ void dlm_adjust_timeouts(struct dlm_ls *ls)
1157 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) 1233 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
1158 lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); 1234 lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
1159 mutex_unlock(&ls->ls_timeout_mutex); 1235 mutex_unlock(&ls->ls_timeout_mutex);
1236
1237 if (!dlm_config.ci_waitwarn_us)
1238 return;
1239
1240 mutex_lock(&ls->ls_waiters_mutex);
1241 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
1242 if (ktime_to_us(lkb->lkb_wait_time))
1243 lkb->lkb_wait_time = ktime_get();
1244 }
1245 mutex_unlock(&ls->ls_waiters_mutex);
1160} 1246}
1161 1247
1162/* lkb is master or local copy */ 1248/* lkb is master or local copy */
@@ -1376,14 +1462,8 @@ static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
1376 ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become 1462 ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become
1377 compatible with other granted locks */ 1463 compatible with other granted locks */
1378 1464
1379static void munge_demoted(struct dlm_lkb *lkb, struct dlm_message *ms) 1465static void munge_demoted(struct dlm_lkb *lkb)
1380{ 1466{
1381 if (ms->m_type != DLM_MSG_CONVERT_REPLY) {
1382 log_print("munge_demoted %x invalid reply type %d",
1383 lkb->lkb_id, ms->m_type);
1384 return;
1385 }
1386
1387 if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) { 1467 if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) {
1388 log_print("munge_demoted %x invalid modes gr %d rq %d", 1468 log_print("munge_demoted %x invalid modes gr %d rq %d",
1389 lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode); 1469 lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode);
@@ -2844,12 +2924,12 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2844 struct dlm_mhandle *mh; 2924 struct dlm_mhandle *mh;
2845 int to_nodeid, error; 2925 int to_nodeid, error;
2846 2926
2847 error = add_to_waiters(lkb, mstype); 2927 to_nodeid = r->res_nodeid;
2928
2929 error = add_to_waiters(lkb, mstype, to_nodeid);
2848 if (error) 2930 if (error)
2849 return error; 2931 return error;
2850 2932
2851 to_nodeid = r->res_nodeid;
2852
2853 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); 2933 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2854 if (error) 2934 if (error)
2855 goto fail; 2935 goto fail;
@@ -2880,9 +2960,9 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2880 /* down conversions go without a reply from the master */ 2960 /* down conversions go without a reply from the master */
2881 if (!error && down_conversion(lkb)) { 2961 if (!error && down_conversion(lkb)) {
2882 remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY); 2962 remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
2963 r->res_ls->ls_stub_ms.m_flags = DLM_IFL_STUB_MS;
2883 r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; 2964 r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
2884 r->res_ls->ls_stub_ms.m_result = 0; 2965 r->res_ls->ls_stub_ms.m_result = 0;
2885 r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
2886 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms); 2966 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
2887 } 2967 }
2888 2968
@@ -2951,12 +3031,12 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2951 struct dlm_mhandle *mh; 3031 struct dlm_mhandle *mh;
2952 int to_nodeid, error; 3032 int to_nodeid, error;
2953 3033
2954 error = add_to_waiters(lkb, DLM_MSG_LOOKUP); 3034 to_nodeid = dlm_dir_nodeid(r);
3035
3036 error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid);
2955 if (error) 3037 if (error)
2956 return error; 3038 return error;
2957 3039
2958 to_nodeid = dlm_dir_nodeid(r);
2959
2960 error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh); 3040 error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
2961 if (error) 3041 if (error)
2962 goto fail; 3042 goto fail;
@@ -3070,6 +3150,9 @@ static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
3070 3150
3071static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms) 3151static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
3072{ 3152{
3153 if (ms->m_flags == DLM_IFL_STUB_MS)
3154 return;
3155
3073 lkb->lkb_sbflags = ms->m_sbflags; 3156 lkb->lkb_sbflags = ms->m_sbflags;
3074 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | 3157 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
3075 (ms->m_flags & 0x0000FFFF); 3158 (ms->m_flags & 0x0000FFFF);
@@ -3612,7 +3695,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
3612 /* convert was queued on remote master */ 3695 /* convert was queued on remote master */
3613 receive_flags_reply(lkb, ms); 3696 receive_flags_reply(lkb, ms);
3614 if (is_demoted(lkb)) 3697 if (is_demoted(lkb))
3615 munge_demoted(lkb, ms); 3698 munge_demoted(lkb);
3616 del_lkb(r, lkb); 3699 del_lkb(r, lkb);
3617 add_lkb(r, lkb, DLM_LKSTS_CONVERT); 3700 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
3618 add_timeout(lkb); 3701 add_timeout(lkb);
@@ -3622,7 +3705,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
3622 /* convert was granted on remote master */ 3705 /* convert was granted on remote master */
3623 receive_flags_reply(lkb, ms); 3706 receive_flags_reply(lkb, ms);
3624 if (is_demoted(lkb)) 3707 if (is_demoted(lkb))
3625 munge_demoted(lkb, ms); 3708 munge_demoted(lkb);
3626 grant_lock_pc(r, lkb, ms); 3709 grant_lock_pc(r, lkb, ms);
3627 queue_cast(r, lkb, 0); 3710 queue_cast(r, lkb, 0);
3628 break; 3711 break;
@@ -3996,15 +4079,17 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid)
3996 dlm_put_lockspace(ls); 4079 dlm_put_lockspace(ls);
3997} 4080}
3998 4081
3999static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb) 4082static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
4083 struct dlm_message *ms_stub)
4000{ 4084{
4001 if (middle_conversion(lkb)) { 4085 if (middle_conversion(lkb)) {
4002 hold_lkb(lkb); 4086 hold_lkb(lkb);
4003 ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; 4087 memset(ms_stub, 0, sizeof(struct dlm_message));
4004 ls->ls_stub_ms.m_result = -EINPROGRESS; 4088 ms_stub->m_flags = DLM_IFL_STUB_MS;
4005 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 4089 ms_stub->m_type = DLM_MSG_CONVERT_REPLY;
4006 ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; 4090 ms_stub->m_result = -EINPROGRESS;
4007 _receive_convert_reply(lkb, &ls->ls_stub_ms); 4091 ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
4092 _receive_convert_reply(lkb, ms_stub);
4008 4093
4009 /* Same special case as in receive_rcom_lock_args() */ 4094 /* Same special case as in receive_rcom_lock_args() */
4010 lkb->lkb_grmode = DLM_LOCK_IV; 4095 lkb->lkb_grmode = DLM_LOCK_IV;
@@ -4045,13 +4130,27 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
4045void dlm_recover_waiters_pre(struct dlm_ls *ls) 4130void dlm_recover_waiters_pre(struct dlm_ls *ls)
4046{ 4131{
4047 struct dlm_lkb *lkb, *safe; 4132 struct dlm_lkb *lkb, *safe;
4133 struct dlm_message *ms_stub;
4048 int wait_type, stub_unlock_result, stub_cancel_result; 4134 int wait_type, stub_unlock_result, stub_cancel_result;
4049 4135
 4136	ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL);
4137 if (!ms_stub) {
4138 log_error(ls, "dlm_recover_waiters_pre no mem");
4139 return;
4140 }
4141
4050 mutex_lock(&ls->ls_waiters_mutex); 4142 mutex_lock(&ls->ls_waiters_mutex);
4051 4143
4052 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) { 4144 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
4053 log_debug(ls, "pre recover waiter lkid %x type %d flags %x", 4145
4054 lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags); 4146 /* exclude debug messages about unlocks because there can be so
4147 many and they aren't very interesting */
4148
4149 if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) {
4150 log_debug(ls, "recover_waiter %x nodeid %d "
4151 "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid,
4152 lkb->lkb_wait_type, lkb->lkb_wait_nodeid);
4153 }
4055 4154
4056 /* all outstanding lookups, regardless of destination will be 4155 /* all outstanding lookups, regardless of destination will be
4057 resent after recovery is done */ 4156 resent after recovery is done */
@@ -4097,26 +4196,28 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
4097 break; 4196 break;
4098 4197
4099 case DLM_MSG_CONVERT: 4198 case DLM_MSG_CONVERT:
4100 recover_convert_waiter(ls, lkb); 4199 recover_convert_waiter(ls, lkb, ms_stub);
4101 break; 4200 break;
4102 4201
4103 case DLM_MSG_UNLOCK: 4202 case DLM_MSG_UNLOCK:
4104 hold_lkb(lkb); 4203 hold_lkb(lkb);
4105 ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY; 4204 memset(ms_stub, 0, sizeof(struct dlm_message));
4106 ls->ls_stub_ms.m_result = stub_unlock_result; 4205 ms_stub->m_flags = DLM_IFL_STUB_MS;
4107 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 4206 ms_stub->m_type = DLM_MSG_UNLOCK_REPLY;
4108 ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; 4207 ms_stub->m_result = stub_unlock_result;
4109 _receive_unlock_reply(lkb, &ls->ls_stub_ms); 4208 ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
4209 _receive_unlock_reply(lkb, ms_stub);
4110 dlm_put_lkb(lkb); 4210 dlm_put_lkb(lkb);
4111 break; 4211 break;
4112 4212
4113 case DLM_MSG_CANCEL: 4213 case DLM_MSG_CANCEL:
4114 hold_lkb(lkb); 4214 hold_lkb(lkb);
4115 ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY; 4215 memset(ms_stub, 0, sizeof(struct dlm_message));
4116 ls->ls_stub_ms.m_result = stub_cancel_result; 4216 ms_stub->m_flags = DLM_IFL_STUB_MS;
4117 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 4217 ms_stub->m_type = DLM_MSG_CANCEL_REPLY;
4118 ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; 4218 ms_stub->m_result = stub_cancel_result;
4119 _receive_cancel_reply(lkb, &ls->ls_stub_ms); 4219 ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
4220 _receive_cancel_reply(lkb, ms_stub);
4120 dlm_put_lkb(lkb); 4221 dlm_put_lkb(lkb);
4121 break; 4222 break;
4122 4223
@@ -4127,6 +4228,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
4127 schedule(); 4228 schedule();
4128 } 4229 }
4129 mutex_unlock(&ls->ls_waiters_mutex); 4230 mutex_unlock(&ls->ls_waiters_mutex);
4231 kfree(ms_stub);
4130} 4232}
4131 4233
4132static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls) 4234static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
@@ -4191,8 +4293,8 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
4191 ou = is_overlap_unlock(lkb); 4293 ou = is_overlap_unlock(lkb);
4192 err = 0; 4294 err = 0;
4193 4295
4194 log_debug(ls, "recover_waiters_post %x type %d flags %x %s", 4296 log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d",
4195 lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name); 4297 lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid);
4196 4298
4197 /* At this point we assume that we won't get a reply to any 4299 /* At this point we assume that we won't get a reply to any
4198 previous op or overlap op on this lock. First, do a big 4300 previous op or overlap op on this lock. First, do a big
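
Editor's note: dlm_scan_waiters() above warns at most once per remote node per scan, using the small linear table in nodeid_warned(). A standalone sketch of that dedup helper plus the threshold test it feeds; the wait times, node ids, and threshold are made up:

#include <stdio.h>
#include <string.h>

/* record nodeid in the first free slot; report 1 if already present */
static int nodeid_warned_sketch(int nodeid, int num_nodes, int *warned)
{
	int i;

	for (i = 0; i < num_nodes; i++) {
		if (!warned[i]) {
			warned[i] = nodeid;
			return 0;
		}
		if (warned[i] == nodeid)
			return 1;
	}
	return 0;
}

int main(void)
{
	int warned[4];
	long long wait_us[] = { 120, 750000, 900000 };	/* per-lkb waits */
	int node[]          = { 2,   3,      3      };	/* node owing the reply */
	long long threshold = 500000;			/* models ci_waitwarn_us */
	int i;

	memset(warned, 0, sizeof(warned));
	for (i = 0; i < 3; i++) {
		if (wait_us[i] < threshold)
			continue;
		if (nodeid_warned_sketch(node[i], 4, warned))
			continue;	/* node 3 is only reported once */
		printf("waitwarn: check connection to node %d\n", node[i]);
	}
	return 0;
}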
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 88e93c80cc22..265017a7c3e7 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -24,6 +24,7 @@ int dlm_put_lkb(struct dlm_lkb *lkb);
24void dlm_scan_rsbs(struct dlm_ls *ls); 24void dlm_scan_rsbs(struct dlm_ls *ls);
25int dlm_lock_recovery_try(struct dlm_ls *ls); 25int dlm_lock_recovery_try(struct dlm_ls *ls);
26void dlm_unlock_recovery(struct dlm_ls *ls); 26void dlm_unlock_recovery(struct dlm_ls *ls);
27void dlm_scan_waiters(struct dlm_ls *ls);
27void dlm_scan_timeout(struct dlm_ls *ls); 28void dlm_scan_timeout(struct dlm_ls *ls);
28void dlm_adjust_timeouts(struct dlm_ls *ls); 29void dlm_adjust_timeouts(struct dlm_ls *ls);
29 30
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index f994a7dfda85..14cbf4099753 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -243,7 +243,6 @@ static struct dlm_ls *find_ls_to_scan(void)
243static int dlm_scand(void *data) 243static int dlm_scand(void *data)
244{ 244{
245 struct dlm_ls *ls; 245 struct dlm_ls *ls;
246 int timeout_jiffies = dlm_config.ci_scan_secs * HZ;
247 246
248 while (!kthread_should_stop()) { 247 while (!kthread_should_stop()) {
249 ls = find_ls_to_scan(); 248 ls = find_ls_to_scan();
@@ -252,13 +251,14 @@ static int dlm_scand(void *data)
252 ls->ls_scan_time = jiffies; 251 ls->ls_scan_time = jiffies;
253 dlm_scan_rsbs(ls); 252 dlm_scan_rsbs(ls);
254 dlm_scan_timeout(ls); 253 dlm_scan_timeout(ls);
254 dlm_scan_waiters(ls);
255 dlm_unlock_recovery(ls); 255 dlm_unlock_recovery(ls);
256 } else { 256 } else {
257 ls->ls_scan_time += HZ; 257 ls->ls_scan_time += HZ;
258 } 258 }
259 } else { 259 continue;
260 schedule_timeout_interruptible(timeout_jiffies);
261 } 260 }
261 schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
262 } 262 }
263 return 0; 263 return 0;
264} 264}
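
Editor's note: the dlm_scand() rework above also stops caching the sleep interval: ci_scan_secs is re-read on every idle pass, so a configfs change to scan_secs now takes effect on the next iteration instead of never. A trivial sketch of the difference (variable names illustrative):

#include <stdio.h>

static int scan_secs = 5;	/* stands in for dlm_config.ci_scan_secs */

int main(void)
{
	int pass;

	for (pass = 0; pass < 3; pass++) {
		/* interval re-read each pass, as dlm_scand() now does */
		printf("pass %d: sleep %d s\n", pass, scan_secs);
		if (pass == 0)
			scan_secs = 10;	/* runtime config change is picked up */
	}
	return 0;
}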
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index b80e0aa3cfa5..5a59efa0bb46 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -50,7 +50,7 @@ static int __init init_dlm(void)
50 if (error) 50 if (error)
51 goto out_netlink; 51 goto out_netlink;
52 52
53 printk("DLM (built %s %s) installed\n", __DATE__, __TIME__); 53 printk("DLM installed\n");
54 54
55 return 0; 55 return 0;
56 56
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 30d8b85febbf..e2b878004364 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -71,6 +71,36 @@ static void send_op(struct plock_op *op)
71 wake_up(&send_wq); 71 wake_up(&send_wq);
72} 72}
73 73
74/* If a process was killed while waiting for the only plock on a file,
75 locks_remove_posix will not see any lock on the file so it won't
76 send an unlock-close to us to pass on to userspace to clean up the
77 abandoned waiter. So, we have to insert the unlock-close when the
78 lock call is interrupted. */
79
80static void do_unlock_close(struct dlm_ls *ls, u64 number,
81 struct file *file, struct file_lock *fl)
82{
83 struct plock_op *op;
84
85 op = kzalloc(sizeof(*op), GFP_NOFS);
86 if (!op)
87 return;
88
89 op->info.optype = DLM_PLOCK_OP_UNLOCK;
90 op->info.pid = fl->fl_pid;
91 op->info.fsid = ls->ls_global_id;
92 op->info.number = number;
93 op->info.start = 0;
94 op->info.end = OFFSET_MAX;
95 if (fl->fl_lmops && fl->fl_lmops->fl_grant)
96 op->info.owner = (__u64) fl->fl_pid;
97 else
98 op->info.owner = (__u64)(long) fl->fl_owner;
99
100 op->info.flags |= DLM_PLOCK_FL_CLOSE;
101 send_op(op);
102}
103
74int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, 104int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
75 int cmd, struct file_lock *fl) 105 int cmd, struct file_lock *fl)
76{ 106{
@@ -114,9 +144,19 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
114 144
115 send_op(op); 145 send_op(op);
116 146
117 if (xop->callback == NULL) 147 if (xop->callback == NULL) {
118 wait_event(recv_wq, (op->done != 0)); 148 rv = wait_event_killable(recv_wq, (op->done != 0));
119 else { 149 if (rv == -ERESTARTSYS) {
150 log_debug(ls, "dlm_posix_lock: wait killed %llx",
151 (unsigned long long)number);
152 spin_lock(&ops_lock);
153 list_del(&op->list);
154 spin_unlock(&ops_lock);
155 kfree(xop);
156 do_unlock_close(ls, number, file, fl);
157 goto out;
158 }
159 } else {
120 rv = FILE_LOCK_DEFERRED; 160 rv = FILE_LOCK_DEFERRED;
121 goto out; 161 goto out;
122 } 162 }
@@ -233,6 +273,13 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
233 else 273 else
234 op->info.owner = (__u64)(long) fl->fl_owner; 274 op->info.owner = (__u64)(long) fl->fl_owner;
235 275
276 if (fl->fl_flags & FL_CLOSE) {
277 op->info.flags |= DLM_PLOCK_FL_CLOSE;
278 send_op(op);
279 rv = 0;
280 goto out;
281 }
282
236 send_op(op); 283 send_op(op);
237 wait_event(recv_wq, (op->done != 0)); 284 wait_event(recv_wq, (op->done != 0));
238 285
@@ -334,7 +381,10 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
334 spin_lock(&ops_lock); 381 spin_lock(&ops_lock);
335 if (!list_empty(&send_list)) { 382 if (!list_empty(&send_list)) {
336 op = list_entry(send_list.next, struct plock_op, list); 383 op = list_entry(send_list.next, struct plock_op, list);
337 list_move(&op->list, &recv_list); 384 if (op->info.flags & DLM_PLOCK_FL_CLOSE)
385 list_del(&op->list);
386 else
387 list_move(&op->list, &recv_list);
338 memcpy(&info, &op->info, sizeof(info)); 388 memcpy(&info, &op->info, sizeof(info));
339 } 389 }
340 spin_unlock(&ops_lock); 390 spin_unlock(&ops_lock);
@@ -342,6 +392,13 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
342 if (!op) 392 if (!op)
343 return -EAGAIN; 393 return -EAGAIN;
344 394
395 /* there is no need to get a reply from userspace for unlocks
396 that were generated by the vfs cleaning up for a close
397 (the process did not make an unlock call). */
398
399 if (op->info.flags & DLM_PLOCK_FL_CLOSE)
400 kfree(op);
401
345 if (copy_to_user(u, &info, sizeof(info))) 402 if (copy_to_user(u, &info, sizeof(info)))
346 return -EFAULT; 403 return -EFAULT;
347 return sizeof(info); 404 return sizeof(info);
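
Editor's note: two rules interlock in the plock.c changes above. Unlocks generated by the VFS on close (FL_CLOSE), and the cleanup unlock injected by do_unlock_close() when a killable wait is interrupted, carry DLM_PLOCK_FL_CLOSE and are never waited on; dev_read() therefore frees such ops after copying them out instead of parking them on recv_list. A small model of that routing decision; the types and strings are illustrative:

#include <stdio.h>
#include <stdbool.h>

struct op_sketch {
	bool close_flag;	/* models DLM_PLOCK_FL_CLOSE */
	const char *name;
};

/* models the new dev_read() rule: close-flagged ops expect no reply */
static const char *route(const struct op_sketch *op)
{
	return op->close_flag ? "copy to userspace, then free"
			      : "move to recv_list, wait for reply";
}

int main(void)
{
	struct op_sketch lock_op  = { false, "plock" };
	struct op_sketch close_op = { true,  "unlock on close" };

	printf("%s: %s\n", lock_op.name, route(&lock_op));
	printf("%s: %s\n", close_op.name, route(&close_op));
	return 0;
}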
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index d5ab3fe7c198..e96bf3e9be88 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -611,7 +611,6 @@ static ssize_t device_write(struct file *file, const char __user *buf,
611 611
612 out_sig: 612 out_sig:
613 sigprocmask(SIG_SETMASK, &tmpsig, NULL); 613 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
614 recalc_sigpending();
615 out_free: 614 out_free:
616 kfree(kbuf); 615 kfree(kbuf);
617 return error; 616 return error;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 98b77c89494c..c00e055b6282 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -40,9 +40,12 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
40static void drop_slab(void) 40static void drop_slab(void)
41{ 41{
42 int nr_objects; 42 int nr_objects;
43 struct shrink_control shrink = {
44 .gfp_mask = GFP_KERNEL,
45 };
43 46
44 do { 47 do {
45 nr_objects = shrink_slab(1000, GFP_KERNEL, 1000); 48 nr_objects = shrink_slab(&shrink, 1000, 1000);
46 } while (nr_objects > 10); 49 } while (nr_objects > 10);
47} 50}
48 51
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4d4cc6a90cd5..bc116b9ffcf2 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -521,12 +521,16 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
521 struct dentry *lower_dir_dentry; 521 struct dentry *lower_dir_dentry;
522 int rc; 522 int rc;
523 523
524 dentry_unhash(dentry);
525
524 lower_dentry = ecryptfs_dentry_to_lower(dentry); 526 lower_dentry = ecryptfs_dentry_to_lower(dentry);
525 dget(dentry); 527 dget(dentry);
526 lower_dir_dentry = lock_parent(lower_dentry); 528 lower_dir_dentry = lock_parent(lower_dentry);
527 dget(lower_dentry); 529 dget(lower_dentry);
528 rc = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry); 530 rc = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry);
529 dput(lower_dentry); 531 dput(lower_dentry);
532 if (!rc && dentry->d_inode)
533 clear_nlink(dentry->d_inode);
530 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); 534 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
531 dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; 535 dir->i_nlink = lower_dir_dentry->d_inode->i_nlink;
532 unlock_dir(lower_dir_dentry); 536 unlock_dir(lower_dir_dentry);
@@ -571,6 +575,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
571 struct dentry *lower_new_dir_dentry; 575 struct dentry *lower_new_dir_dentry;
572 struct dentry *trap = NULL; 576 struct dentry *trap = NULL;
573 577
578 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
579 dentry_unhash(new_dentry);
580
574 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); 581 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
575 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); 582 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
576 dget(lower_old_dentry); 583 dget(lower_old_dentry);
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 03e609c45012..27a7fefb83eb 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -599,8 +599,8 @@ struct ecryptfs_write_tag_70_packet_silly_stack {
 	struct mutex *tfm_mutex;
 	char *block_aligned_filename;
 	struct ecryptfs_auth_tok *auth_tok;
-	struct scatterlist src_sg;
-	struct scatterlist dst_sg;
+	struct scatterlist src_sg[2];
+	struct scatterlist dst_sg[2];
 	struct blkcipher_desc desc;
 	char iv[ECRYPTFS_MAX_IV_BYTES];
 	char hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
@@ -816,23 +816,21 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 	memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename,
 	       filename_size);
 	rc = virt_to_scatterlist(s->block_aligned_filename,
-				 s->block_aligned_filename_size, &s->src_sg, 1);
-	if (rc != 1) {
+				 s->block_aligned_filename_size, s->src_sg, 2);
+	if (rc < 1) {
 		printk(KERN_ERR "%s: Internal error whilst attempting to "
-		       "convert filename memory to scatterlist; "
-		       "expected rc = 1; got rc = [%d]. "
+		       "convert filename memory to scatterlist; rc = [%d]. "
 		       "block_aligned_filename_size = [%zd]\n", __func__, rc,
 		       s->block_aligned_filename_size);
 		goto out_release_free_unlock;
 	}
 	rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size,
-				 &s->dst_sg, 1);
-	if (rc != 1) {
+				 s->dst_sg, 2);
+	if (rc < 1) {
 		printk(KERN_ERR "%s: Internal error whilst attempting to "
 		       "convert encrypted filename memory to scatterlist; "
-		       "expected rc = 1; got rc = [%d]. "
-		       "block_aligned_filename_size = [%zd]\n", __func__, rc,
-		       s->block_aligned_filename_size);
+		       "rc = [%d]. block_aligned_filename_size = [%zd]\n",
+		       __func__, rc, s->block_aligned_filename_size);
 		goto out_release_free_unlock;
 	}
 	/* The characters in the first block effectively do the job
@@ -855,7 +853,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 		       mount_crypt_stat->global_default_fn_cipher_key_bytes);
 		goto out_release_free_unlock;
 	}
-	rc = crypto_blkcipher_encrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
+	rc = crypto_blkcipher_encrypt_iv(&s->desc, s->dst_sg, s->src_sg,
 					 s->block_aligned_filename_size);
 	if (rc) {
 		printk(KERN_ERR "%s: Error attempting to encrypt filename; "
@@ -891,8 +889,8 @@ struct ecryptfs_parse_tag_70_packet_silly_stack {
 	struct mutex *tfm_mutex;
 	char *decrypted_filename;
 	struct ecryptfs_auth_tok *auth_tok;
-	struct scatterlist src_sg;
-	struct scatterlist dst_sg;
+	struct scatterlist src_sg[2];
+	struct scatterlist dst_sg[2];
 	struct blkcipher_desc desc;
 	char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1];
 	char iv[ECRYPTFS_MAX_IV_BYTES];
@@ -1008,13 +1006,12 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 	}
 	mutex_lock(s->tfm_mutex);
 	rc = virt_to_scatterlist(&data[(*packet_size)],
-				 s->block_aligned_filename_size, &s->src_sg, 1);
-	if (rc != 1) {
+				 s->block_aligned_filename_size, s->src_sg, 2);
+	if (rc < 1) {
 		printk(KERN_ERR "%s: Internal error whilst attempting to "
 		       "convert encrypted filename memory to scatterlist; "
-		       "expected rc = 1; got rc = [%d]. "
-		       "block_aligned_filename_size = [%zd]\n", __func__, rc,
-		       s->block_aligned_filename_size);
+		       "rc = [%d]. block_aligned_filename_size = [%zd]\n",
+		       __func__, rc, s->block_aligned_filename_size);
 		goto out_unlock;
 	}
 	(*packet_size) += s->block_aligned_filename_size;
@@ -1028,13 +1025,12 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 		goto out_unlock;
 	}
 	rc = virt_to_scatterlist(s->decrypted_filename,
-				 s->block_aligned_filename_size, &s->dst_sg, 1);
-	if (rc != 1) {
+				 s->block_aligned_filename_size, s->dst_sg, 2);
+	if (rc < 1) {
 		printk(KERN_ERR "%s: Internal error whilst attempting to "
 		       "convert decrypted filename memory to scatterlist; "
-		       "expected rc = 1; got rc = [%d]. "
-		       "block_aligned_filename_size = [%zd]\n", __func__, rc,
-		       s->block_aligned_filename_size);
+		       "rc = [%d]. block_aligned_filename_size = [%zd]\n",
+		       __func__, rc, s->block_aligned_filename_size);
 		goto out_free_unlock;
 	}
 	/* The characters in the first block effectively do the job of
@@ -1065,7 +1061,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 		       mount_crypt_stat->global_default_fn_cipher_key_bytes);
 		goto out_free_unlock;
 	}
-	rc = crypto_blkcipher_decrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
+	rc = crypto_blkcipher_decrypt_iv(&s->desc, s->dst_sg, s->src_sg,
 					 s->block_aligned_filename_size);
 	if (rc) {
 		printk(KERN_ERR "%s: Error attempting to decrypt filename; "
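The keystore changes above widen src_sg/dst_sg from one scatterlist entry to an array of two and relax the result check from != 1 to < 1. The reason: a virtually contiguous kmalloc'ed buffer can straddle one page boundary, in which case virt_to_scatterlist() (an eCryptfs helper) needs two entries to describe it. A hedged sketch of the pattern; demo_map_buffer and the 512-byte size are illustrative only:

	static int demo_map_buffer(void)
	{
		struct scatterlist sg[2];
		char *buf = kmalloc(512, GFP_KERNEL);	/* may cross a page boundary */
		int nents;

		if (!buf)
			return -ENOMEM;
		/* Returns the number of entries used; a sub-page buffer needs
		 * at most two, depending on where kmalloc placed it. */
		nents = virt_to_scatterlist(buf, 512, sg, 2);
		if (nents < 1) {
			kfree(buf);
			return -ENOMEM;
		}
		/* ... hand sg/nents to the crypto layer ... */
		kfree(buf);
		return 0;
	}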
diff --git a/fs/exec.c b/fs/exec.c
index 5e62d26a4fec..ea5f748906a8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -42,7 +42,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/module.h>
 #include <linux/namei.h>
-#include <linux/proc_fs.h>
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
@@ -55,6 +54,7 @@
 #include <linux/fs_struct.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
+#include <linux/compat.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -166,8 +166,13 @@ out:
 }
 
 #ifdef CONFIG_MMU
-
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+/*
+ * The nascent bprm->mm is not visible until exec_mmap() but it can
+ * use a lot of memory, account these pages in current->mm temporary
+ * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
+ * change the counter back via acct_arg_size(0).
+ */
+static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 	struct mm_struct *mm = current->mm;
 	long diff = (long)(pages - bprm->vma_pages);
@@ -186,7 +191,7 @@ void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 #endif
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
@@ -194,7 +199,7 @@ struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 
 #ifdef CONFIG_STACK_GROWSUP
 	if (write) {
-		ret = expand_stack_downwards(bprm->vma, pos);
+		ret = expand_downwards(bprm->vma, pos);
 		if (ret < 0)
 			return NULL;
 	}
@@ -305,11 +310,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len)
 
 #else
 
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
@@ -398,22 +403,56 @@ err:
 	return err;
 }
 
+struct user_arg_ptr {
+#ifdef CONFIG_COMPAT
+	bool is_compat;
+#endif
+	union {
+		const char __user *const __user *native;
+#ifdef CONFIG_COMPAT
+		compat_uptr_t __user *compat;
+#endif
+	} ptr;
+};
+
+static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
+{
+	const char __user *native;
+
+#ifdef CONFIG_COMPAT
+	if (unlikely(argv.is_compat)) {
+		compat_uptr_t compat;
+
+		if (get_user(compat, argv.ptr.compat + nr))
+			return ERR_PTR(-EFAULT);
+
+		return compat_ptr(compat);
+	}
+#endif
+
+	if (get_user(native, argv.ptr.native + nr))
+		return ERR_PTR(-EFAULT);
+
+	return native;
+}
+
 /*
  * count() counts the number of strings in array ARGV.
  */
-static int count(const char __user * const __user * argv, int max)
+static int count(struct user_arg_ptr argv, int max)
 {
 	int i = 0;
 
-	if (argv != NULL) {
+	if (argv.ptr.native != NULL) {
 		for (;;) {
-			const char __user * p;
+			const char __user *p = get_user_arg_ptr(argv, i);
 
-			if (get_user(p, argv))
-				return -EFAULT;
 			if (!p)
 				break;
-			argv++;
+
+			if (IS_ERR(p))
+				return -EFAULT;
+
 			if (i++ >= max)
 				return -E2BIG;
 
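The user_arg_ptr/get_user_arg_ptr machinery added above exists because a 32-bit task on a 64-bit kernel passes argv[] as an array of 32-bit pointers. The old bare get_user(p, argv + nr) performs a native-width load, which on such an array would fuse two adjacent entries. A hedged sketch of the failure mode the union avoids — not code from the patch:

	/* WRONG for compat callers: native pointer load from a 32-bit argv[]. */
	static const char __user *broken_fetch(const char __user *const __user *argv,
					       int nr)
	{
		const char __user *p;

		if (get_user(p, argv + nr))	/* 8-byte load on 64-bit kernels */
			return ERR_PTR(-EFAULT);
		return p;	/* two fused 4-byte entries if the task is compat */
	}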
@@ -430,7 +469,7 @@ static int count(const char __user * const __user * argv, int max)
  * processes's memory to the new process's stack. The call to get_user_pages()
  * ensures the destination page is created and not swapped out.
  */
-static int copy_strings(int argc, const char __user *const __user *argv,
+static int copy_strings(int argc, struct user_arg_ptr argv,
 			struct linux_binprm *bprm)
 {
 	struct page *kmapped_page = NULL;
@@ -443,16 +482,18 @@ static int copy_strings(int argc, const char __user *const __user *argv,
 		int len;
 		unsigned long pos;
 
-		if (get_user(str, argv+argc) ||
-		    !(len = strnlen_user(str, MAX_ARG_STRLEN))) {
-			ret = -EFAULT;
-			goto out;
-		}
+		ret = -EFAULT;
+		str = get_user_arg_ptr(argv, argc);
+		if (IS_ERR(str))
+			goto out;
 
-		if (!valid_arg_len(bprm, len)) {
-			ret = -E2BIG;
+		len = strnlen_user(str, MAX_ARG_STRLEN);
+		if (!len)
+			goto out;
+
+		ret = -E2BIG;
+		if (!valid_arg_len(bprm, len))
 			goto out;
-		}
 
 		/* We're going to work our way backwords. */
 		pos = bprm->p;
@@ -519,14 +560,19 @@ out:
 /*
  * Like copy_strings, but get argv and its values from kernel memory.
  */
-int copy_strings_kernel(int argc, const char *const *argv,
+int copy_strings_kernel(int argc, const char *const *__argv,
 			struct linux_binprm *bprm)
 {
 	int r;
 	mm_segment_t oldfs = get_fs();
+	struct user_arg_ptr argv = {
+		.ptr.native = (const char __user *const __user *)__argv,
+	};
+
 	set_fs(KERNEL_DS);
-	r = copy_strings(argc, (const char __user *const __user *)argv, bprm);
+	r = copy_strings(argc, argv, bprm);
 	set_fs(oldfs);
+
 	return r;
 }
 EXPORT_SYMBOL(copy_strings_kernel);
@@ -553,7 +599,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 	unsigned long length = old_end - old_start;
 	unsigned long new_start = old_start - shift;
 	unsigned long new_end = old_end - shift;
-	struct mmu_gather *tlb;
+	struct mmu_gather tlb;
 
 	BUG_ON(new_start > new_end);
 
@@ -579,12 +625,12 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		return -ENOMEM;
 
 	lru_add_drain();
-	tlb = tlb_gather_mmu(mm, 0);
+	tlb_gather_mmu(&tlb, mm, 0);
 	if (new_end > old_start) {
 		/*
 		 * when the old and new regions overlap clear from new_end.
 		 */
-		free_pgd_range(tlb, new_end, old_end, new_end,
+		free_pgd_range(&tlb, new_end, old_end, new_end,
 			vma->vm_next ? vma->vm_next->vm_start : 0);
 	} else {
 		/*
@@ -593,10 +639,10 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		 * have constraints on va-space that make this illegal (IA64) -
 		 * for the others its just a little faster.
 		 */
-		free_pgd_range(tlb, old_start, old_end, new_end,
+		free_pgd_range(&tlb, old_start, old_end, new_end,
 			vma->vm_next ? vma->vm_next->vm_start : 0);
 	}
-	tlb_finish_mmu(tlb, new_end, old_end);
+	tlb_finish_mmu(&tlb, new_end, old_end);
 
 	/*
 	 * Shrink the vma to just the new range. Always succeeds.
@@ -1004,6 +1050,7 @@ char *get_task_comm(char *buf, struct task_struct *tsk)
 	task_unlock(tsk);
 	return buf;
 }
+EXPORT_SYMBOL_GPL(get_task_comm);
 
 void set_task_comm(struct task_struct *tsk, char *buf)
 {
@@ -1379,10 +1426,10 @@ EXPORT_SYMBOL(search_binary_handler);
 /*
  * sys_execve() executes a new program.
  */
-int do_execve(const char * filename,
-	const char __user *const __user *argv,
-	const char __user *const __user *envp,
-	struct pt_regs * regs)
+static int do_execve_common(const char *filename,
+				struct user_arg_ptr argv,
+				struct user_arg_ptr envp,
+				struct pt_regs *regs)
 {
 	struct linux_binprm *bprm;
 	struct file *file;
@@ -1489,6 +1536,34 @@ out_ret:
 	return retval;
 }
 
+int do_execve(const char *filename,
+	const char __user *const __user *__argv,
+	const char __user *const __user *__envp,
+	struct pt_regs *regs)
+{
+	struct user_arg_ptr argv = { .ptr.native = __argv };
+	struct user_arg_ptr envp = { .ptr.native = __envp };
+	return do_execve_common(filename, argv, envp, regs);
+}
+
+#ifdef CONFIG_COMPAT
+int compat_do_execve(char *filename,
+	compat_uptr_t __user *__argv,
+	compat_uptr_t __user *__envp,
+	struct pt_regs *regs)
+{
+	struct user_arg_ptr argv = {
+		.is_compat = true,
+		.ptr.compat = __argv,
+	};
+	struct user_arg_ptr envp = {
+		.is_compat = true,
+		.ptr.compat = __envp,
+	};
+	return do_execve_common(filename, argv, envp, regs);
+}
+#endif
+
 void set_binfmt(struct linux_binfmt *new)
 {
 	struct mm_struct *mm = current->mm;
@@ -1548,6 +1623,41 @@ expand_fail:
 	return ret;
 }
 
+static int cn_print_exe_file(struct core_name *cn)
+{
+	struct file *exe_file;
+	char *pathbuf, *path, *p;
+	int ret;
+
+	exe_file = get_mm_exe_file(current->mm);
+	if (!exe_file)
+		return cn_printf(cn, "(unknown)");
+
+	pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
+	if (!pathbuf) {
+		ret = -ENOMEM;
+		goto put_exe_file;
+	}
+
+	path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
+	if (IS_ERR(path)) {
+		ret = PTR_ERR(path);
+		goto free_buf;
+	}
+
+	for (p = path; *p; p++)
+		if (*p == '/')
+			*p = '!';
+
+	ret = cn_printf(cn, "%s", path);
+
+free_buf:
+	kfree(pathbuf);
+put_exe_file:
+	fput(exe_file);
+	return ret;
+}
+
 /* format_corename will inspect the pattern parameter, and output a
  * name into corename, which must have space for at least
  * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
@@ -1619,6 +1729,9 @@ static int format_corename(struct core_name *cn, long signr)
 		case 'e':
 			err = cn_printf(cn, "%s", current->comm);
 			break;
+		case 'E':
+			err = cn_print_exe_file(cn);
+			break;
 		/* core limit size */
 		case 'c':
 			err = cn_printf(cn, "%lu",
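With cn_print_exe_file() and the 'E' case above, core_pattern gains a %E specifier that expands to the path of the dumping executable with '/' rewritten to '!'. A usage sketch from userspace; the pattern string is an example, not mandated by the patch:

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/kernel/core_pattern", "w");

		if (!f)
			return 1;
		/* A crash of /usr/bin/foo then dumps to core.!usr!bin!foo.<pid> */
		fputs("core.%E.%p", f);
		fclose(f);
		return 0;
	}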
@@ -1659,6 +1772,7 @@ static int zap_process(struct task_struct *start, int exit_code)
 
 	t = start;
 	do {
+		task_clear_group_stop_pending(t);
 		if (t != current && t->mm) {
 			sigaddset(&t->pending.signal, SIGKILL);
 			signal_wake_up(t, 1);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 0a78dae7e2cb..1dd62ed35b85 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -898,7 +898,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	brelse(bh);
 
 	if (!sb_set_blocksize(sb, blocksize)) {
-		ext2_msg(sb, KERN_ERR, "error: blocksize is too small");
+		ext2_msg(sb, KERN_ERR,
+			"error: bad blocksize %d", blocksize);
 		goto failed_sbi;
 	}
 
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 32f3b8695859..34b6d9bfc48a 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1416,10 +1416,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	frame->at = entries;
 	frame->bh = bh;
 	bh = bh2;
+	/*
+	 * Mark buffers dirty here so that if do_split() fails we write a
+	 * consistent set of buffers to disk.
+	 */
+	ext3_journal_dirty_metadata(handle, frame->bh);
+	ext3_journal_dirty_metadata(handle, bh);
 	de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
-	dx_release (frames);
-	if (!(de))
+	if (!de) {
+		ext3_mark_inode_dirty(handle, dir);
+		dx_release(frames);
 		return retval;
+	}
+	dx_release(frames);
 
 	return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
@@ -2189,6 +2198,7 @@ static int ext3_symlink (struct inode * dir,
 	handle_t *handle;
 	struct inode * inode;
 	int l, err, retries = 0;
+	int credits;
 
 	l = strlen(symname)+1;
 	if (l > dir->i_sb->s_blocksize)
@@ -2196,10 +2206,26 @@ static int ext3_symlink (struct inode * dir,
 
 	dquot_initialize(dir);
 
+	if (l > EXT3_N_BLOCKS * 4) {
+		/*
+		 * For non-fast symlinks, we just allocate inode and put it on
+		 * orphan list in the first transaction => we need bitmap,
+		 * group descriptor, sb, inode block, quota blocks.
+		 */
+		credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+	} else {
+		/*
+		 * Fast symlink. We have to add entry to directory
+		 * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS),
+		 * allocate new inode (bitmap, group descriptor, inode block,
+		 * quota blocks, sb is already counted in previous macros).
+		 */
+		credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+			  EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+			  EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+	}
 retry:
-	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
-					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
-					EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+	handle = ext3_journal_start(dir, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -2211,21 +2237,45 @@ retry:
 	if (IS_ERR(inode))
 		goto out_stop;
 
-	if (l > sizeof (EXT3_I(inode)->i_data)) {
+	if (l > EXT3_N_BLOCKS * 4) {
 		inode->i_op = &ext3_symlink_inode_operations;
 		ext3_set_aops(inode);
 		/*
-		 * page_symlink() calls into ext3_prepare/commit_write.
-		 * We have a transaction open. All is sweetness. It also sets
-		 * i_size in generic_commit_write().
+		 * We cannot call page_symlink() with transaction started
+		 * because it calls into ext3_write_begin() which acquires page
+		 * lock which ranks below transaction start (and it can also
+		 * wait for journal commit if we are running out of space). So
+		 * we have to stop transaction now and restart it when symlink
+		 * contents is written.
+		 *
+		 * To keep fs consistent in case of crash, we have to put inode
+		 * to orphan list in the mean time.
 		 */
+		drop_nlink(inode);
+		err = ext3_orphan_add(handle, inode);
+		ext3_journal_stop(handle);
+		if (err)
+			goto err_drop_inode;
 		err = __page_symlink(inode, symname, l, 1);
+		if (err)
+			goto err_drop_inode;
+		/*
+		 * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS
+		 * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
+		 */
+		handle = ext3_journal_start(dir,
+				EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+				EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto err_drop_inode;
+		}
+		inc_nlink(inode);
+		err = ext3_orphan_del(handle, inode);
 		if (err) {
+			ext3_journal_stop(handle);
 			drop_nlink(inode);
-			unlock_new_inode(inode);
-			ext3_mark_inode_dirty(handle, inode);
-			iput (inode);
-			goto out_stop;
+			goto err_drop_inode;
 		}
 	} else {
 		inode->i_op = &ext3_fast_symlink_inode_operations;
@@ -2239,6 +2289,10 @@ out_stop:
 	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
 	return err;
+err_drop_inode:
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
 }
 
 static int ext3_link (struct dentry * old_dentry,
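The two credit formulas introduced above encode one decision: a slow (page-backed) symlink only allocates the inode and puts it on the orphan list in its first transaction, while a fast symlink does everything in one go. A hedged restatement as a helper — not part of the patch, same arithmetic:

	static int ext3_symlink_credits(struct super_block *sb, int len)
	{
		if (len > EXT3_N_BLOCKS * 4)
			/* slow symlink: bitmap, group desc, sb, inode + quota */
			return 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(sb);
		/* fast symlink: directory entry plus new inode, one transaction */
		return EXT3_DATA_TRANS_BLOCKS(sb) +
		       EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
		       EXT3_MAXQUOTAS_INIT_BLOCKS(sb);
	}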
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3c6a9e0eadc1..aad153ef6b78 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -36,6 +36,7 @@
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
 #include <linux/log2.h>
+#include <linux/cleancache.h>
 
 #include <asm/uaccess.h>
 
@@ -1367,6 +1368,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
 	} else {
 		ext3_msg(sb, KERN_INFO, "using internal journal");
 	}
+	cleancache_init_fs(sb);
 	return res;
 }
 
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index c947e36eda6c..04109460ba9e 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
 
 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
+		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+		mmp.o
 
 ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1c67139ad4b4..264f6949511e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -362,130 +362,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 }
 
 /**
- * ext4_add_groupblocks() -- Add given blocks to an existing group
- * @handle:	handle to this transaction
- * @sb:		super block
- * @block:	start physcial block to add to the block group
- * @count:	number of blocks to free
- *
- * This marks the blocks as free in the bitmap. We ask the
- * mballoc to reload the buddy after this by setting group
- * EXT4_GROUP_INFO_NEED_INIT_BIT flag
- */
-void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-			 ext4_fsblk_t block, unsigned long count)
-{
-	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *gd_bh;
-	ext4_group_t block_group;
-	ext4_grpblk_t bit;
-	unsigned int i;
-	struct ext4_group_desc *desc;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	int err = 0, ret, blk_free_count;
-	ext4_grpblk_t blocks_freed;
-	struct ext4_group_info *grp;
-
-	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
-
-	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
-	grp = ext4_get_group_info(sb, block_group);
-	/*
-	 * Check to see if we are freeing blocks across a group
-	 * boundary.
-	 */
-	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
-		goto error_return;
-	}
-	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
-	if (!bitmap_bh)
-		goto error_return;
-	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
-	if (!desc)
-		goto error_return;
-
-	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
-	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
-	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
-	    in_range(block + count - 1, ext4_inode_table(sb, desc),
-		     sbi->s_itb_per_group)) {
-		ext4_error(sb, "Adding blocks in system zones - "
-			   "Block = %llu, count = %lu",
-			   block, count);
-		goto error_return;
-	}
-
-	/*
-	 * We are about to add blocks to the bitmap,
-	 * so we need undo access.
-	 */
-	BUFFER_TRACE(bitmap_bh, "getting undo access");
-	err = ext4_journal_get_undo_access(handle, bitmap_bh);
-	if (err)
-		goto error_return;
-
-	/*
-	 * We are about to modify some metadata. Call the journal APIs
-	 * to unshare ->b_data if a currently-committing transaction is
-	 * using it
-	 */
-	BUFFER_TRACE(gd_bh, "get_write_access");
-	err = ext4_journal_get_write_access(handle, gd_bh);
-	if (err)
-		goto error_return;
-	/*
-	 * make sure we don't allow a parallel init on other groups in the
-	 * same buddy cache
-	 */
-	down_write(&grp->alloc_sem);
-	for (i = 0, blocks_freed = 0; i < count; i++) {
-		BUFFER_TRACE(bitmap_bh, "clear bit");
-		if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
-						bit + i, bitmap_bh->b_data)) {
-			ext4_error(sb, "bit already cleared for block %llu",
-				   (ext4_fsblk_t)(block + i));
-			BUFFER_TRACE(bitmap_bh, "bit already cleared");
-		} else {
-			blocks_freed++;
-		}
-	}
-	ext4_lock_group(sb, block_group);
-	blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
-	ext4_free_blks_set(sb, desc, blk_free_count);
-	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
-	ext4_unlock_group(sb, block_group);
-	percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
-
-	if (sbi->s_log_groups_per_flex) {
-		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		atomic_add(blocks_freed,
-			   &sbi->s_flex_groups[flex_group].free_blocks);
-	}
-	/*
-	 * request to reload the buddy with the
-	 * new bitmap information
-	 */
-	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
-	grp->bb_free += blocks_freed;
-	up_write(&grp->alloc_sem);
-
-	/* We dirtied the bitmap block */
-	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
-	/* And the group descriptor block */
-	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
-	if (!err)
-		err = ret;
-
-error_return:
-	brelse(bitmap_bh);
-	ext4_std_error(sb, err);
-	return;
-}
-
-/**
  * ext4_has_free_blocks()
  * @sbi:	in-core super block structure.
  * @nblocks:	number of needed blocks
@@ -493,7 +369,8 @@ error_return:
  * Check if filesystem has nblocks free & available for allocation.
  * On success return 1, return 0 on failure.
  */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi,
+				s64 nblocks, unsigned int flags)
 {
 	s64 free_blocks, dirty_blocks, root_blocks;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
@@ -507,11 +384,6 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 	    EXT4_FREEBLOCKS_WATERMARK) {
 		free_blocks  = percpu_counter_sum_positive(fbc);
 		dirty_blocks = percpu_counter_sum_positive(dbc);
-		if (dirty_blocks < 0) {
-			printk(KERN_CRIT "Dirty block accounting "
-					"went wrong %lld\n",
-					(long long)dirty_blocks);
-		}
 	}
 	/* Check whether we have space after
 	 * accounting for current dirty blocks & root reserved blocks.
@@ -522,7 +394,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 	/* Hm, nope. Are (enough) root reserved blocks available? */
 	if (sbi->s_resuid == current_fsuid() ||
 	    ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
-	    capable(CAP_SYS_RESOURCE)) {
+	    capable(CAP_SYS_RESOURCE) ||
+		(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+
 		if (free_blocks >= (nblocks + dirty_blocks))
 			return 1;
 	}
@@ -531,9 +405,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 }
 
 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-						s64 nblocks)
+			   s64 nblocks, unsigned int flags)
 {
-	if (ext4_has_free_blocks(sbi, nblocks)) {
+	if (ext4_has_free_blocks(sbi, nblocks, flags)) {
 		percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
 		return 0;
 	} else
@@ -554,7 +428,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
 	    (*retries)++ > 3 ||
 	    !EXT4_SB(sb)->s_journal)
 		return 0;
@@ -577,7 +451,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
  * error stores in errp pointer
  */
 ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-		ext4_fsblk_t goal, unsigned long *count, int *errp)
+				  ext4_fsblk_t goal, unsigned int flags,
+				  unsigned long *count, int *errp)
 {
 	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
@@ -587,6 +462,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	ar.inode = inode;
 	ar.goal = goal;
 	ar.len = count ? *count : 1;
+	ar.flags = flags;
 
 	ret = ext4_mb_new_blocks(handle, &ar, errp);
 	if (count)
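Every allocation path above now threads an extra flags word down to ext4_has_free_blocks(), so a caller that must make forward progress (for example while freeing space) can opt into the root reserve. A hedged caller sketch using the new signature; demo_alloc_meta is illustrative, not from the patch:

	static ext4_fsblk_t demo_alloc_meta(handle_t *handle, struct inode *inode,
					    ext4_fsblk_t goal, int *err)
	{
		unsigned long count = 1;

		/* EXT4_MB_USE_ROOT_BLOCKS: may dip into the root reserve. */
		return ext4_new_meta_blocks(handle, inode, goal,
					    EXT4_MB_USE_ROOT_BLOCKS,
					    &count, err);
	}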
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4daaf2b753f4..a74b89c09f90 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -108,7 +108,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_DELALLOC_RESERVED	0x0400
 /* We are doing stream allocation */
 #define EXT4_MB_STREAM_ALLOC		0x0800
-
+/* Use reserved root blocks if needed */
+#define EXT4_MB_USE_ROOT_BLOCKS		0x1000
 
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -209,6 +210,8 @@ struct ext4_io_submit {
  */
 #define	EXT4_BAD_INO		 1	/* Bad blocks inode */
 #define EXT4_ROOT_INO		 2	/* Root inode */
+#define EXT4_USR_QUOTA_INO	 3	/* User quota inode */
+#define EXT4_GRP_QUOTA_INO	 4	/* Group quota inode */
 #define EXT4_BOOT_LOADER_INO	 5	/* Boot loader inode */
 #define EXT4_UNDEL_DIR_INO	 6	/* Undelete directory inode */
 #define EXT4_RESIZE_INO		 7	/* Reserved group descriptors inode */
@@ -512,6 +515,10 @@ struct ext4_new_group_data {
 	/* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+	/* Punch out blocks of an extent */
+#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT		0x0020
+	/* Don't normalize allocation size (used for fallocate) */
+#define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
 
 /*
  * Flags used by ext4_free_blocks
@@ -1028,7 +1035,7 @@ struct ext4_super_block {
 	__le16  s_want_extra_isize;	/* New inodes should reserve # bytes */
 	__le32	s_flags;		/* Miscellaneous flags */
 	__le16  s_raid_stride;		/* RAID stride */
-	__le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
+	__le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
 	__le64  s_mmp_block;            /* Block for multi-mount protection */
 	__le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
 	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
@@ -1144,6 +1151,9 @@ struct ext4_sb_info {
 	unsigned long s_ext_blocks;
 	unsigned long s_ext_extents;
 #endif
+	/* ext4 extent cache stats */
+	unsigned long extent_cache_hits;
+	unsigned long extent_cache_misses;
 
 	/* for buddy allocator */
 	struct ext4_group_info ***s_group_info;
@@ -1201,6 +1211,9 @@ struct ext4_sb_info {
 	struct ext4_li_request *s_li_request;
 	/* Wait multiplier for lazy initialization thread */
 	unsigned int s_li_wait_mult;
+
+	/* Kernel thread for multiple mount protection */
+	struct task_struct *s_mmp_tsk;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1338,6 +1351,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM		0x0010
 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
+#define EXT4_FEATURE_RO_COMPAT_QUOTA		0x0100
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -1351,13 +1365,29 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
 
+#define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT2_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT3_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_RECOVER| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
 #define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
 					 EXT4_FEATURE_INCOMPAT_RECOVER| \
 					 EXT4_FEATURE_INCOMPAT_META_BG| \
 					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
 					 EXT4_FEATURE_INCOMPAT_64BIT| \
-					 EXT4_FEATURE_INCOMPAT_FLEX_BG)
+					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+					 EXT4_FEATURE_INCOMPAT_MMP)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1590,12 +1620,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
  */
 struct ext4_lazy_init {
 	unsigned long		li_state;
-
-	wait_queue_head_t	li_wait_daemon;
-	wait_queue_head_t	li_wait_task;
-	struct timer_list	li_timer;
-	struct task_struct	*li_task;
-
 	struct list_head	li_request_list;
 	struct mutex		li_list_mtx;
 };
@@ -1615,6 +1639,67 @@ struct ext4_features {
 };
 
 /*
+ * This structure will be used for multiple mount protection. It will be
+ * written into the block number saved in the s_mmp_block field in the
+ * superblock. Programs that check MMP should assume that if
+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
+ * to use the filesystem, regardless of how old the timestamp is.
+ */
+#define EXT4_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define EXT4_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
+#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */
+
+struct mmp_struct {
+	__le32	mmp_magic;		/* Magic number for MMP */
+	__le32	mmp_seq;		/* Sequence no. updated periodically */
+
+	/*
+	 * mmp_time, mmp_nodename & mmp_bdevname are only used for information
+	 * purposes and do not affect the correctness of the algorithm
+	 */
+	__le64	mmp_time;		/* Time last updated */
+	char	mmp_nodename[64];	/* Node which last updated MMP block */
+	char	mmp_bdevname[32];	/* Bdev which last updated MMP block */
+
+	/*
+	 * mmp_check_interval is used to verify if the MMP block has been
+	 * updated on the block device. The value is updated based on the
+	 * maximum time to write the MMP block during an update cycle.
+	 */
+	__le16	mmp_check_interval;
+
+	__le16	mmp_pad1;
+	__le32	mmp_pad2[227];
+};
+
+/* arguments passed to the mmp thread */
+struct mmpd_data {
+	struct buffer_head *bh; /* bh from initial read_mmp_block() */
+	struct super_block *sb; /* super block of the fs */
+};
+
+/*
+ * Check interval multiplier
+ * The MMP block is written every update interval and initially checked every
+ * update interval x the multiplier (the value is then adapted based on the
+ * write latency). The reason is that writes can be delayed under load and we
+ * don't want readers to incorrectly assume that the filesystem is no longer
+ * in use.
+ */
+#define EXT4_MMP_CHECK_MULT		2UL
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MIN_CHECK_INTERVAL	5UL
+
+/*
+ * Maximum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
+
+/*
  * Function prototypes
  */
 
@@ -1638,10 +1723,12 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 			ext4_group_t group);
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-				ext4_fsblk_t block, unsigned long count);
+					 ext4_fsblk_t goal,
+					 unsigned int flags,
+					 unsigned long *count,
+					 int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+				  s64 nblocks, unsigned int flags);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
 extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1706,6 +1793,8 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			     unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+				ext4_fsblk_t block, unsigned long count);
 extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
 
 /* inode.c */
@@ -1729,6 +1818,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int  ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int  ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -1738,6 +1828,8 @@ extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1788,6 +1880,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int,
 			__LINE__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+			   const char *, unsigned int, const char *);
+#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
+						       __LINE__, msg)
 extern void __ext4_grp_locked_error(const char *, unsigned int, \
 				    struct super_block *, ext4_group_t, \
 				    unsigned long, ext4_fsblk_t, \
@@ -2064,6 +2160,8 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(struct inode *);
+extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
+				loff_t length);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
@@ -2092,6 +2190,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
 			   int len,
 			   struct writeback_control *wbc);
 
+/* mmp.c */
+extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+
 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
 enum ext4_state_bits {
 	BH_Uninit	/* blocks are allocated but uninitialized on disk */
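The mmp_struct and sequence codes above imply the open-time handshake that the new mmp.c implements: read the MMP block, remember mmp_seq, wait longer than the writer's update interval, re-read, and refuse the mount if the sequence moved. A hedged sketch of that first check only — the real ext4_multi_mount_protect() also starts the s_mmp_tsk writer thread and performs the delayed re-read:

	static int mmp_check_sketch(struct super_block *sb, ext4_fsblk_t mmp_block)
	{
		struct buffer_head *bh = sb_bread(sb, mmp_block);
		struct mmp_struct *mmp;
		u32 seq;

		if (!bh)
			return -EIO;
		mmp = (struct mmp_struct *)bh->b_data;
		if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) {
			brelse(bh);
			return -EINVAL;
		}
		seq = le32_to_cpu(mmp->mmp_seq);
		brelse(bh);

		if (seq == EXT4_MMP_SEQ_CLEAN)
			return 0;		/* last user unmounted cleanly */
		if (seq == EXT4_MMP_SEQ_FSCK || seq > EXT4_MMP_SEQ_MAX)
			return -EBUSY;		/* fsck owns it / unknown code */
		/* Otherwise: sleep EXT4_MMP_CHECK_MULT * mmp_check_interval
		 * seconds, re-read the block, and fail if mmp_seq changed. */
		return 0;
	}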
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6e272ef6ba96..f5240aa15601 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,20 +6,6 @@
 
 #include <trace/events/ext4.h>
 
-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
-				   handle_t *handle, struct buffer_head *bh)
-{
-	int err = 0;
-
-	if (ext4_handle_valid(handle)) {
-		err = jbd2_journal_get_undo_access(handle, bh);
-		if (err)
-			ext4_journal_abort_handle(where, line, __func__, bh,
-						  handle, err);
-	}
-	return err;
-}
-
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
 				    handle_t *handle, struct buffer_head *bh)
 {
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index d0f53538a57f..bb85757689b6 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -126,9 +126,6 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
 			       const char *err_fn,
 			       struct buffer_head *bh, handle_t *handle, int err);
 
-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
-				   handle_t *handle, struct buffer_head *bh);
-
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
 				    handle_t *handle, struct buffer_head *bh);
 
@@ -146,8 +143,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 int __ext4_handle_dirty_super(const char *where, unsigned int line,
 			      handle_t *handle, struct super_block *sb);
 
-#define ext4_journal_get_undo_access(handle, bh) \
-	__ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
 	__ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
 #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4890d6f3ad15..5199bac7fc62 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -46,6 +46,13 @@
 
 #include <trace/events/ext4.h>
 
+static int ext4_split_extent(handle_t *handle,
+				struct inode *inode,
+				struct ext4_ext_path *path,
+				struct ext4_map_blocks *map,
+				int split_flag,
+				int flags);
+
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
 					    int needed)
@@ -192,12 +199,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 static ext4_fsblk_t
 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 			struct ext4_ext_path *path,
-			struct ext4_extent *ex, int *err)
+			struct ext4_extent *ex, int *err, unsigned int flags)
 {
 	ext4_fsblk_t goal, newblock;
 
 	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
+	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
+					NULL, err);
 	return newblock;
 }
 
@@ -474,9 +482,43 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 	}
 	ext_debug("\n");
 }
+
+static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
+			ext4_fsblk_t newblock, int level)
+{
+	int depth = ext_depth(inode);
+	struct ext4_extent *ex;
+
+	if (depth != level) {
+		struct ext4_extent_idx *idx;
+		idx = path[level].p_idx;
+		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
+			ext_debug("%d: move %d:%llu in new index %llu\n", level,
+					le32_to_cpu(idx->ei_block),
+					ext4_idx_pblock(idx),
+					newblock);
+			idx++;
+		}
+
+		return;
+	}
+
+	ex = path[depth].p_ext;
+	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
+		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
+				le32_to_cpu(ex->ee_block),
+				ext4_ext_pblock(ex),
+				ext4_ext_is_uninitialized(ex),
+				ext4_ext_get_actual_len(ex),
+				newblock);
+		ex++;
+	}
+}
+
 #else
 #define ext4_ext_show_path(inode, path)
 #define ext4_ext_show_leaf(inode, path)
+#define ext4_ext_show_move(inode, path, newblock, level)
 #endif
 
 void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -792,14 +834,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
792 * - initializes subtree 834 * - initializes subtree
793 */ 835 */
794static int ext4_ext_split(handle_t *handle, struct inode *inode, 836static int ext4_ext_split(handle_t *handle, struct inode *inode,
795 struct ext4_ext_path *path, 837 unsigned int flags,
796 struct ext4_extent *newext, int at) 838 struct ext4_ext_path *path,
839 struct ext4_extent *newext, int at)
797{ 840{
798 struct buffer_head *bh = NULL; 841 struct buffer_head *bh = NULL;
799 int depth = ext_depth(inode); 842 int depth = ext_depth(inode);
800 struct ext4_extent_header *neh; 843 struct ext4_extent_header *neh;
801 struct ext4_extent_idx *fidx; 844 struct ext4_extent_idx *fidx;
802 struct ext4_extent *ex;
803 int i = at, k, m, a; 845 int i = at, k, m, a;
804 ext4_fsblk_t newblock, oldblock; 846 ext4_fsblk_t newblock, oldblock;
805 __le32 border; 847 __le32 border;
@@ -847,7 +889,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
847 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); 889 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
848 for (a = 0; a < depth - at; a++) { 890 for (a = 0; a < depth - at; a++) {
849 newblock = ext4_ext_new_meta_block(handle, inode, path, 891 newblock = ext4_ext_new_meta_block(handle, inode, path,
850 newext, &err); 892 newext, &err, flags);
851 if (newblock == 0) 893 if (newblock == 0)
852 goto cleanup; 894 goto cleanup;
853 ablocks[a] = newblock; 895 ablocks[a] = newblock;
@@ -876,7 +918,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
876 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); 918 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
877 neh->eh_magic = EXT4_EXT_MAGIC; 919 neh->eh_magic = EXT4_EXT_MAGIC;
878 neh->eh_depth = 0; 920 neh->eh_depth = 0;
879 ex = EXT_FIRST_EXTENT(neh);
880 921
881 /* move remainder of path[depth] to the new leaf */ 922 /* move remainder of path[depth] to the new leaf */
882 if (unlikely(path[depth].p_hdr->eh_entries != 923 if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -888,25 +929,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
888 goto cleanup; 929 goto cleanup;
889 } 930 }
890 /* start copying from the next extent */ 931 /* start copying from the next extent */
891 /* TODO: we could do it by single memmove */ 932 m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
892 m = 0; 933 ext4_ext_show_move(inode, path, newblock, depth);
893 path[depth].p_ext++;
894 while (path[depth].p_ext <=
895 EXT_MAX_EXTENT(path[depth].p_hdr)) {
896 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
897 le32_to_cpu(path[depth].p_ext->ee_block),
898 ext4_ext_pblock(path[depth].p_ext),
899 ext4_ext_is_uninitialized(path[depth].p_ext),
900 ext4_ext_get_actual_len(path[depth].p_ext),
901 newblock);
902 /*memmove(ex++, path[depth].p_ext++,
903 sizeof(struct ext4_extent));
904 neh->eh_entries++;*/
905 path[depth].p_ext++;
906 m++;
907 }
908 if (m) { 934 if (m) {
909 memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m); 935 struct ext4_extent *ex;
936 ex = EXT_FIRST_EXTENT(neh);
937 memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
910 le16_add_cpu(&neh->eh_entries, m); 938 le16_add_cpu(&neh->eh_entries, m);
911 } 939 }
912 940
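The rewritten copy above replaces the old counting loop with one pointer subtraction: m = EXT_MAX_EXTENT(hdr) - p_ext is the number of extents after the current one, the post-increment advances p_ext to the first entry to move, and a single memmove() transfers them (the per-entry debug output now lives in ext4_ext_show_move()). A user-space sketch of that idiom, with a plain array standing in for the extent leaf:

#include <stdio.h>
#include <string.h>

struct ent { int key; };

/* move every element after *cur into dst with one memmove(), counting
 * by pointer arithmetic the way the new ext4_ext_split() code does */
static int move_tail(struct ent *cur, struct ent *last, struct ent *dst)
{
        int m = (int)(last - cur);      /* elements strictly after cur */

        cur++;                          /* start copying from the next one */
        if (m)
                memmove(dst, cur, sizeof(struct ent) * m);
        return m;
}

int main(void)
{
        struct ent src[4] = { {1}, {2}, {3}, {4} }, dst[4];
        int m = move_tail(&src[0], &src[3], dst);

        printf("moved %d entries, first moved key = %d\n", m, dst[0].key);
        return 0;
}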
@@ -968,12 +996,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
968 996
969 ext_debug("int.index at %d (block %llu): %u -> %llu\n", 997 ext_debug("int.index at %d (block %llu): %u -> %llu\n",
970 i, newblock, le32_to_cpu(border), oldblock); 998 i, newblock, le32_to_cpu(border), oldblock);
971 /* copy indexes */
972 m = 0;
973 path[i].p_idx++;
974 999
975 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, 1000 /* move remainder of path[i] to the new index block */
976 EXT_MAX_INDEX(path[i].p_hdr));
977 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != 1001 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
978 EXT_LAST_INDEX(path[i].p_hdr))) { 1002 EXT_LAST_INDEX(path[i].p_hdr))) {
979 EXT4_ERROR_INODE(inode, 1003 EXT4_ERROR_INODE(inode,
@@ -982,20 +1006,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
982 err = -EIO; 1006 err = -EIO;
983 goto cleanup; 1007 goto cleanup;
984 } 1008 }
985 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { 1009 /* start copying indexes */
986 ext_debug("%d: move %d:%llu in new index %llu\n", i, 1010 m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
987 le32_to_cpu(path[i].p_idx->ei_block), 1011 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
988 ext4_idx_pblock(path[i].p_idx), 1012 EXT_MAX_INDEX(path[i].p_hdr));
989 newblock); 1013 ext4_ext_show_move(inode, path, newblock, i);
990 /*memmove(++fidx, path[i].p_idx++,
991 sizeof(struct ext4_extent_idx));
992 neh->eh_entries++;
993 BUG_ON(neh->eh_entries > neh->eh_max);*/
994 path[i].p_idx++;
995 m++;
996 }
997 if (m) { 1014 if (m) {
998 memmove(++fidx, path[i].p_idx - m, 1015 memmove(++fidx, path[i].p_idx,
999 sizeof(struct ext4_extent_idx) * m); 1016 sizeof(struct ext4_extent_idx) * m);
1000 le16_add_cpu(&neh->eh_entries, m); 1017 le16_add_cpu(&neh->eh_entries, m);
1001 } 1018 }
@@ -1056,8 +1073,9 @@ cleanup:
1056 * just created block 1073 * just created block
1057 */ 1074 */
1058static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, 1075static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1059 struct ext4_ext_path *path, 1076 unsigned int flags,
1060 struct ext4_extent *newext) 1077 struct ext4_ext_path *path,
1078 struct ext4_extent *newext)
1061{ 1079{
1062 struct ext4_ext_path *curp = path; 1080 struct ext4_ext_path *curp = path;
1063 struct ext4_extent_header *neh; 1081 struct ext4_extent_header *neh;
@@ -1065,7 +1083,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1065 ext4_fsblk_t newblock; 1083 ext4_fsblk_t newblock;
1066 int err = 0; 1084 int err = 0;
1067 1085
1068 newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); 1086 newblock = ext4_ext_new_meta_block(handle, inode, path,
1087 newext, &err, flags);
1069 if (newblock == 0) 1088 if (newblock == 0)
1070 return err; 1089 return err;
1071 1090
@@ -1140,8 +1159,9 @@ out:
1140 * if no free index is found, then it requests in-depth growing. 1159 * if no free index is found, then it requests in-depth growing.
1141 */ 1160 */
1142static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, 1161static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1143 struct ext4_ext_path *path, 1162 unsigned int flags,
1144 struct ext4_extent *newext) 1163 struct ext4_ext_path *path,
1164 struct ext4_extent *newext)
1145{ 1165{
1146 struct ext4_ext_path *curp; 1166 struct ext4_ext_path *curp;
1147 int depth, i, err = 0; 1167 int depth, i, err = 0;
@@ -1161,7 +1181,7 @@ repeat:
1161 if (EXT_HAS_FREE_INDEX(curp)) { 1181 if (EXT_HAS_FREE_INDEX(curp)) {
1162 /* if we found index with free entry, then use that 1182 /* if we found index with free entry, then use that
1163 * entry: create all needed subtree and add new leaf */ 1183 * entry: create all needed subtree and add new leaf */
1164 err = ext4_ext_split(handle, inode, path, newext, i); 1184 err = ext4_ext_split(handle, inode, flags, path, newext, i);
1165 if (err) 1185 if (err)
1166 goto out; 1186 goto out;
1167 1187
@@ -1174,7 +1194,8 @@ repeat:
1174 err = PTR_ERR(path); 1194 err = PTR_ERR(path);
1175 } else { 1195 } else {
1176 /* tree is full, time to grow in depth */ 1196 /* tree is full, time to grow in depth */
1177 err = ext4_ext_grow_indepth(handle, inode, path, newext); 1197 err = ext4_ext_grow_indepth(handle, inode, flags,
1198 path, newext);
1178 if (err) 1199 if (err)
1179 goto out; 1200 goto out;
1180 1201
@@ -1563,7 +1584,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1563 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns 1584 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1564 * 1 if they got merged. 1585 * 1 if they got merged.
1565 */ 1586 */
1566static int ext4_ext_try_to_merge(struct inode *inode, 1587static int ext4_ext_try_to_merge_right(struct inode *inode,
1567 struct ext4_ext_path *path, 1588 struct ext4_ext_path *path,
1568 struct ext4_extent *ex) 1589 struct ext4_extent *ex)
1569{ 1590{
@@ -1603,6 +1624,31 @@ static int ext4_ext_try_to_merge(struct inode *inode,
1603} 1624}
1604 1625
1605/* 1626/*
1627 * This function tries to merge the @ex extent with its neighbours in the tree,
1628 * trying the left neighbour first. Returns 1 if @ex was merged with its right neighbour, else 0.
1629 */
1630static int ext4_ext_try_to_merge(struct inode *inode,
1631 struct ext4_ext_path *path,
1632 struct ext4_extent *ex) {
1633 struct ext4_extent_header *eh;
1634 unsigned int depth;
1635 int merge_done = 0;
1636 int ret = 0;
1637
1638 depth = ext_depth(inode);
1639 BUG_ON(path[depth].p_hdr == NULL);
1640 eh = path[depth].p_hdr;
1641
1642 if (ex > EXT_FIRST_EXTENT(eh))
1643 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1644
1645 if (!merge_done)
1646 ret = ext4_ext_try_to_merge_right(inode, path, ex);
1647
1648 return ret;
1649}
1650
1651/*
1606 * check if a portion of the "newext" extent overlaps with an 1652 * check if a portion of the "newext" extent overlaps with an
1607 * existing extent. 1653 * existing extent.
1608 * 1654 *
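ext4_ext_try_to_merge_right() only ever merges an extent with the one after it, so the new wrapper gets a left merge by handing it ex - 1; only if that fails does it attempt the right merge on ex itself. A toy sketch of the trick over a plain int array (merge_right() is a hypothetical stand-in with a made-up merge condition):

/* assumed helper: merges a[i] with a[i + 1] when equal, returns 1 on merge */
static int merge_right(const int *a, int n, int i)
{
        return i + 1 < n && a[i] == a[i + 1];
}

static int try_to_merge(const int *a, int n, int i)
{
        int merged_left = 0;

        if (i > 0)                              /* not the first entry */
                merged_left = merge_right(a, n, i - 1); /* merge leftwards */
        if (!merged_left)
                return merge_right(a, n, i);    /* otherwise try rightwards */
        return 0;
}

int main(void)
{
        int a[3] = { 5, 5, 7 };

        return try_to_merge(a, 3, 1);   /* returns 0: entry merged leftwards */
}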
@@ -1668,6 +1714,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1668 int depth, len, err; 1714 int depth, len, err;
1669 ext4_lblk_t next; 1715 ext4_lblk_t next;
1670 unsigned uninitialized = 0; 1716 unsigned uninitialized = 0;
1717 int flags = 0;
1671 1718
1672 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { 1719 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1673 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); 1720 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1742,7 +1789,9 @@ repeat:
1742 * There is no free space in the found leaf. 1789 * There is no free space in the found leaf.
1743 * We're gonna add a new leaf in the tree. 1790 * We're gonna add a new leaf in the tree.
1744 */ 1791 */
1745 err = ext4_ext_create_new_leaf(handle, inode, path, newext); 1792 if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
1793 flags = EXT4_MB_USE_ROOT_BLOCKS;
1794 err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
1746 if (err) 1795 if (err)
1747 goto cleanup; 1796 goto cleanup;
1748 depth = ext_depth(inode); 1797 depth = ext_depth(inode);
@@ -2003,13 +2052,25 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2003} 2052}
2004 2053
2005/* 2054/*
2055 * ext4_ext_check_cache()
2056 * Checks to see if the given block is in the cache.
2057 * If it is, the cached extent is stored in the given
2058 * cache extent pointer. If the cached extent is a hole,
2059 * this routine should be used instead of
2060 * ext4_ext_in_cache if the calling function needs to
2061 * know the size of the hole.
2062 *
2063 * @inode: The file's inode
2064 * @block: The block to look for in the cache
2065 * @ex: Pointer where the cached extent will be stored
2066 * if it contains block
2067 *
2006 * Return 0 if cache is invalid; 1 if the cache is valid 2068 * Return 0 if cache is invalid; 1 if the cache is valid
2007 */ 2069 */
2008static int 2070static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
2009ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, 2071 struct ext4_ext_cache *ex) {
2010 struct ext4_extent *ex)
2011{
2012 struct ext4_ext_cache *cex; 2072 struct ext4_ext_cache *cex;
2073 struct ext4_sb_info *sbi;
2013 int ret = 0; 2074 int ret = 0;
2014 2075
2015 /* 2076 /*
@@ -2017,26 +2078,60 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2017 */ 2078 */
2018 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2079 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2019 cex = &EXT4_I(inode)->i_cached_extent; 2080 cex = &EXT4_I(inode)->i_cached_extent;
2081 sbi = EXT4_SB(inode->i_sb);
2020 2082
2021 /* has cache valid data? */ 2083 /* has cache valid data? */
2022 if (cex->ec_len == 0) 2084 if (cex->ec_len == 0)
2023 goto errout; 2085 goto errout;
2024 2086
2025 if (in_range(block, cex->ec_block, cex->ec_len)) { 2087 if (in_range(block, cex->ec_block, cex->ec_len)) {
2026 ex->ee_block = cpu_to_le32(cex->ec_block); 2088 memcpy(ex, cex, sizeof(struct ext4_ext_cache));
2027 ext4_ext_store_pblock(ex, cex->ec_start);
2028 ex->ee_len = cpu_to_le16(cex->ec_len);
2029 ext_debug("%u cached by %u:%u:%llu\n", 2089 ext_debug("%u cached by %u:%u:%llu\n",
2030 block, 2090 block,
2031 cex->ec_block, cex->ec_len, cex->ec_start); 2091 cex->ec_block, cex->ec_len, cex->ec_start);
2032 ret = 1; 2092 ret = 1;
2033 } 2093 }
2034errout: 2094errout:
2095 if (!ret)
2096 sbi->extent_cache_misses++;
2097 else
2098 sbi->extent_cache_hits++;
2035 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 2099 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2036 return ret; 2100 return ret;
2037} 2101}
2038 2102
2039/* 2103/*
2104 * ext4_ext_in_cache()
2105 * Checks to see if the given block is in the cache.
2106 * If it is, the cached extent is stored in the given
2107 * extent pointer.
2108 *
2109 * @inode: The file's inode
2110 * @block: The block to look for in the cache
2111 * @ex: Pointer where the cached extent will be stored
2112 * if it contains block
2113 *
2114 * Return 0 if cache is invalid; 1 if the cache is valid
2115 */
2116static int
2117ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2118 struct ext4_extent *ex)
2119{
2120 struct ext4_ext_cache cex;
2121 int ret = 0;
2122
2123 if (ext4_ext_check_cache(inode, block, &cex)) {
2124 ex->ee_block = cpu_to_le32(cex.ec_block);
2125 ext4_ext_store_pblock(ex, cex.ec_start);
2126 ex->ee_len = cpu_to_le16(cex.ec_len);
2127 ret = 1;
2128 }
2129
2130 return ret;
2131}
2132
2133
2134/*
2040 * ext4_ext_rm_idx: 2135 * ext4_ext_rm_idx:
2041 * removes index from the index block. 2136 * removes index from the index block.
2042 * It's used in truncate case only, thus all requests are for 2137 * It's used in truncate case only, thus all requests are for
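After this split, ext4_ext_in_cache() is a thin wrapper and the probe itself lives in ext4_ext_check_cache(), which copies out the whole struct ext4_ext_cache; a caller can therefore see the length of a cached hole (a gap is cached with a zero physical start in this scheme), and the new hit/miss counters are bumped in exactly one place. A self-contained user-space analogue, with stand-in types and names:

#include <stdio.h>

/* stand-in for struct ext4_ext_cache; start == 0 marks a cached hole */
struct ext_cache { unsigned block, len; unsigned long long start; };

static int check_cache(const struct ext_cache *cex, unsigned lblk,
                       struct ext_cache *out)
{
        if (cex->len && lblk >= cex->block && lblk < cex->block + cex->len) {
                *out = *cex;    /* copy the whole entry, like the memcpy above */
                return 1;
        }
        return 0;
}

int main(void)
{
        struct ext_cache cache = { 100, 50, 0 }, out;   /* a cached hole */

        if (check_cache(&cache, 120, &out) && out.start == 0)
                printf("hole of %u blocks cached at %u\n", out.len, out.block);
        return 0;
}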
@@ -2163,8 +2258,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2163 ext4_free_blocks(handle, inode, NULL, start, num, flags); 2258 ext4_free_blocks(handle, inode, NULL, start, num, flags);
2164 } else if (from == le32_to_cpu(ex->ee_block) 2259 } else if (from == le32_to_cpu(ex->ee_block)
2165 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2260 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2166 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", 2261 /* head removal */
2167 from, to, le32_to_cpu(ex->ee_block), ee_len); 2262 ext4_lblk_t num;
2263 ext4_fsblk_t start;
2264
2265 num = to - from;
2266 start = ext4_ext_pblock(ex);
2267
2268 ext_debug("free first %u blocks starting %llu\n", num, start);
2269 ext4_free_blocks(handle, inode, NULL, start, num, flags);
2270
2168 } else { 2271 } else {
2169 printk(KERN_INFO "strange request: removal(2) " 2272 printk(KERN_INFO "strange request: removal(2) "
2170 "%u-%u from %u:%u\n", 2273 "%u-%u from %u:%u\n",
@@ -2173,9 +2276,22 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2173 return 0; 2276 return 0;
2174} 2277}
2175 2278
2279
2280/*
2281 * ext4_ext_rm_leaf() removes the extents associated with the
2282 * blocks appearing between "start" and "end", and splits the extents
2283 * if "start" and "end" appear in the same extent.
2284 *
2285 * @handle: The journal handle
2286 * @inode: The file's inode
2287 * @path: The path to the leaf
2288 * @start: The first block to remove
2289 * @end: The last block to remove
2290 */
2176static int 2291static int
2177ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2292ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2178 struct ext4_ext_path *path, ext4_lblk_t start) 2293 struct ext4_ext_path *path, ext4_lblk_t start,
2294 ext4_lblk_t end)
2179{ 2295{
2180 int err = 0, correct_index = 0; 2296 int err = 0, correct_index = 0;
2181 int depth = ext_depth(inode), credits; 2297 int depth = ext_depth(inode), credits;
@@ -2186,6 +2302,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2186 unsigned short ex_ee_len; 2302 unsigned short ex_ee_len;
2187 unsigned uninitialized = 0; 2303 unsigned uninitialized = 0;
2188 struct ext4_extent *ex; 2304 struct ext4_extent *ex;
2305 struct ext4_map_blocks map;
2189 2306
2190 /* the header must be checked already in ext4_ext_remove_space() */ 2307 /* the header must be checked already in ext4_ext_remove_space() */
2191 ext_debug("truncate since %u in leaf\n", start); 2308 ext_debug("truncate since %u in leaf\n", start);
@@ -2215,31 +2332,95 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2215 path[depth].p_ext = ex; 2332 path[depth].p_ext = ex;
2216 2333
2217 a = ex_ee_block > start ? ex_ee_block : start; 2334 a = ex_ee_block > start ? ex_ee_block : start;
2218 b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ? 2335 b = ex_ee_block + ex_ee_len - 1 < end ?
2219 ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK; 2336 ex_ee_block + ex_ee_len - 1 : end;
2220 2337
2221 ext_debug(" border %u:%u\n", a, b); 2338 ext_debug(" border %u:%u\n", a, b);
2222 2339
2223 if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) { 2340 /* If this extent is beyond the end of the hole, skip it */
2224 block = 0; 2341 if (end <= ex_ee_block) {
2225 num = 0; 2342 ex--;
2226 BUG(); 2343 ex_ee_block = le32_to_cpu(ex->ee_block);
2344 ex_ee_len = ext4_ext_get_actual_len(ex);
2345 continue;
2346 } else if (a != ex_ee_block &&
2347 b != ex_ee_block + ex_ee_len - 1) {
2348 /*
2349 * If this is a truncate, then this condition should
2350 * never happen because at least one of the end points
2351 * needs to be on the edge of the extent.
2352 */
2353 if (end == EXT_MAX_BLOCK) {
2354 ext_debug(" bad truncate %u:%u\n",
2355 start, end);
2356 block = 0;
2357 num = 0;
2358 err = -EIO;
2359 goto out;
2360 }
2361 /*
2362 * else this is a hole punch, so the extent needs to
2363 * be split since neither edge of the hole is on the
2364 * extent edge
2365 */
2366 else {
2367 map.m_pblk = ext4_ext_pblock(ex);
2368 map.m_lblk = ex_ee_block;
2369 map.m_len = b - ex_ee_block;
2370
2371 err = ext4_split_extent(handle,
2372 inode, path, &map, 0,
2373 EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
2374 EXT4_GET_BLOCKS_PRE_IO);
2375
2376 if (err < 0)
2377 goto out;
2378
2379 ex_ee_len = ext4_ext_get_actual_len(ex);
2380
2381 b = ex_ee_block + ex_ee_len - 1 < end ?
2382 ex_ee_block + ex_ee_len - 1 : end;
2383
2384 /* Then remove tail of this extent */
2385 block = ex_ee_block;
2386 num = a - block;
2387 }
2227 } else if (a != ex_ee_block) { 2388 } else if (a != ex_ee_block) {
2228 /* remove tail of the extent */ 2389 /* remove tail of the extent */
2229 block = ex_ee_block; 2390 block = ex_ee_block;
2230 num = a - block; 2391 num = a - block;
2231 } else if (b != ex_ee_block + ex_ee_len - 1) { 2392 } else if (b != ex_ee_block + ex_ee_len - 1) {
2232 /* remove head of the extent */ 2393 /* remove head of the extent */
2233 block = a; 2394 block = b;
2234 num = b - a; 2395 num = ex_ee_block + ex_ee_len - b;
2235 /* there is no "make a hole" API yet */ 2396
2236 BUG(); 2397 /*
2398 * If this is a truncate, this condition
2399 * should never happen
2400 */
2401 if (end == EXT_MAX_BLOCK) {
2402 ext_debug(" bad truncate %u:%u\n",
2403 start, end);
2404 err = -EIO;
2405 goto out;
2406 }
2237 } else { 2407 } else {
2238 /* remove whole extent: excellent! */ 2408 /* remove whole extent: excellent! */
2239 block = ex_ee_block; 2409 block = ex_ee_block;
2240 num = 0; 2410 num = 0;
2241 BUG_ON(a != ex_ee_block); 2411 if (a != ex_ee_block) {
2242 BUG_ON(b != ex_ee_block + ex_ee_len - 1); 2412 ext_debug(" bad truncate %u:%u\n",
2413 start, end);
2414 err = -EIO;
2415 goto out;
2416 }
2417
2418 if (b != ex_ee_block + ex_ee_len - 1) {
2419 ext_debug(" bad truncate %u:%u\n",
2420 start, end);
2421 err = -EIO;
2422 goto out;
2423 }
2243 } 2424 }
2244 2425
2245 /* 2426 /*
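The border arithmetic at the top of the loop is easiest to verify with numbers. Suppose an extent covers logical blocks 100..199 and a hole punch asks for start = 150, end = 170: then a = 150 and b = 170, neither of which lies on an extent edge, so per the comments above the extent must first be split before its tail can be trimmed. A worked sketch (values illustrative):

#include <stdio.h>

int main(void)
{
        unsigned ex_ee_block = 100, ex_ee_len = 100;    /* extent 100..199 */
        unsigned start = 150, end = 170;                /* punched range */

        unsigned a = ex_ee_block > start ? ex_ee_block : start;
        unsigned b = ex_ee_block + ex_ee_len - 1 < end ?
                        ex_ee_block + ex_ee_len - 1 : end;

        printf("a = %u, b = %u\n", a, b);               /* a = 150, b = 170 */
        if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1)
                printf("neither edge aligned: split first, then trim\n");
        return 0;
}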
@@ -2270,7 +2451,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2270 if (num == 0) { 2451 if (num == 0) {
2271 /* this extent is removed; mark slot entirely unused */ 2452 /* this extent is removed; mark slot entirely unused */
2272 ext4_ext_store_pblock(ex, 0); 2453 ext4_ext_store_pblock(ex, 0);
2273 le16_add_cpu(&eh->eh_entries, -1); 2454 } else if (block != ex_ee_block) {
2455 /*
2456 * If this was a head removal, then we need to update
2457 * the physical block since it is now at a different
2458 * location
2459 */
2460 ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
2274 } 2461 }
2275 2462
2276 ex->ee_block = cpu_to_le32(block); 2463 ex->ee_block = cpu_to_le32(block);
@@ -2286,6 +2473,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2286 if (err) 2473 if (err)
2287 goto out; 2474 goto out;
2288 2475
2476 /*
2477 * If the extent was completely released,
2478 * we need to remove it from the leaf
2479 */
2480 if (num == 0) {
2481 if (end != EXT_MAX_BLOCK) {
2482 /*
2483 * For hole punching, we need to shift all the
2484 * extents up when an extent is removed so that
2485 * we don't have blank extents in the middle
2486 */
2487 memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2488 sizeof(struct ext4_extent));
2489
2490 /* Now get rid of the one at the end */
2491 memset(EXT_LAST_EXTENT(eh), 0,
2492 sizeof(struct ext4_extent));
2493 }
2494 le16_add_cpu(&eh->eh_entries, -1);
2495 }
2496
2289 ext_debug("new extent: %u:%u:%llu\n", block, num, 2497 ext_debug("new extent: %u:%u:%llu\n", block, num,
2290 ext4_ext_pblock(ex)); 2498 ext4_ext_pblock(ex));
2291 ex--; 2499 ex--;
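When hole punching fully releases an extent, the block above closes the resulting gap in the leaf: one memmove() shifts the tail of the array left by a slot, the now-duplicated last slot is cleared, and only then is eh_entries decremented. The same compaction step over a plain array (stand-in type; ext4 finds the last pointer with EXT_LAST_EXTENT()):

#include <stdio.h>
#include <string.h>

struct ent { int key; };

/* remove *ex from a packed array whose final element is *last */
static void remove_slot(struct ent *ex, struct ent *last)
{
        memmove(ex, ex + 1, (last - ex) * sizeof(*ex));
        memset(last, 0, sizeof(*last));         /* clear the stale tail slot */
}

int main(void)
{
        struct ent leaf[4] = { {1}, {2}, {3}, {4} };

        remove_slot(&leaf[1], &leaf[3]);        /* drop the entry with key 2 */
        printf("%d %d %d %d\n", leaf[0].key, leaf[1].key,
               leaf[2].key, leaf[3].key);       /* prints: 1 3 4 0 */
        return 0;
}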
@@ -2326,7 +2534,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
2326 return 1; 2534 return 1;
2327} 2535}
2328 2536
2329static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) 2537static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2538 ext4_lblk_t end)
2330{ 2539{
2331 struct super_block *sb = inode->i_sb; 2540 struct super_block *sb = inode->i_sb;
2332 int depth = ext_depth(inode); 2541 int depth = ext_depth(inode);
@@ -2365,7 +2574,8 @@ again:
2365 while (i >= 0 && err == 0) { 2574 while (i >= 0 && err == 0) {
2366 if (i == depth) { 2575 if (i == depth) {
2367 /* this is leaf block */ 2576 /* this is leaf block */
2368 err = ext4_ext_rm_leaf(handle, inode, path, start); 2577 err = ext4_ext_rm_leaf(handle, inode, path,
2578 start, end);
2369 /* root level has p_bh == NULL, brelse() eats this */ 2579 /* root level has p_bh == NULL, brelse() eats this */
2370 brelse(path[i].p_bh); 2580 brelse(path[i].p_bh);
2371 path[i].p_bh = NULL; 2581 path[i].p_bh = NULL;
@@ -2529,6 +2739,195 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2529 return ret; 2739 return ret;
2530} 2740}
2531 2741
2742/*
2743 * Flags used by extent splitting.
2744 */
2745#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
2746 due to ENOSPC */
2747#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
2748#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
2749
2750/*
2751 * ext4_split_extent_at() splits an extent at given block.
2752 *
2753 * @handle: the journal handle
2754 * @inode: the file inode
2755 * @path: the path to the extent
2756 * @split: the logical block where the extent is split.
2757 * @split_flag: indicates whether the extent can be zeroed out if the split
2758 * fails, and the states (init or uninit) of the new extents.
2759 * @flags: flags used to insert the new extent into the extent tree.
2760 *
2761 *
2762 * Splits extent [a, b] into two extents [a, @split) and [@split, b], whose
2763 * states are determined by @split_flag.
2764 *
2765 * There are two cases:
2766 * a> the extent is split into two extents.
2767 * b> no split is needed, and the extent is just marked.
2768 *
2769 * Returns 0 on success.
2770 */
2771static int ext4_split_extent_at(handle_t *handle,
2772 struct inode *inode,
2773 struct ext4_ext_path *path,
2774 ext4_lblk_t split,
2775 int split_flag,
2776 int flags)
2777{
2778 ext4_fsblk_t newblock;
2779 ext4_lblk_t ee_block;
2780 struct ext4_extent *ex, newex, orig_ex;
2781 struct ext4_extent *ex2 = NULL;
2782 unsigned int ee_len, depth;
2783 int err = 0;
2784
2785 ext_debug("ext4_split_extents_at: inode %lu, logical"
2786 "block %llu\n", inode->i_ino, (unsigned long long)split);
2787
2788 ext4_ext_show_leaf(inode, path);
2789
2790 depth = ext_depth(inode);
2791 ex = path[depth].p_ext;
2792 ee_block = le32_to_cpu(ex->ee_block);
2793 ee_len = ext4_ext_get_actual_len(ex);
2794 newblock = split - ee_block + ext4_ext_pblock(ex);
2795
2796 BUG_ON(split < ee_block || split >= (ee_block + ee_len));
2797
2798 err = ext4_ext_get_access(handle, inode, path + depth);
2799 if (err)
2800 goto out;
2801
2802 if (split == ee_block) {
2803 /*
2804 * case b: block @split is the block that the extent begins with,
2805 * so we just change the state of the extent, and splitting
2806 * is not needed.
2807 */
2808 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2809 ext4_ext_mark_uninitialized(ex);
2810 else
2811 ext4_ext_mark_initialized(ex);
2812
2813 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
2814 ext4_ext_try_to_merge(inode, path, ex);
2815
2816 err = ext4_ext_dirty(handle, inode, path + depth);
2817 goto out;
2818 }
2819
2820 /* case a */
2821 memcpy(&orig_ex, ex, sizeof(orig_ex));
2822 ex->ee_len = cpu_to_le16(split - ee_block);
2823 if (split_flag & EXT4_EXT_MARK_UNINIT1)
2824 ext4_ext_mark_uninitialized(ex);
2825
2826 /*
2827 * the path may lead to a new leaf, not to the original leaf
2828 * any more, after ext4_ext_insert_extent() returns
2829 */
2830 err = ext4_ext_dirty(handle, inode, path + depth);
2831 if (err)
2832 goto fix_extent_len;
2833
2834 ex2 = &newex;
2835 ex2->ee_block = cpu_to_le32(split);
2836 ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
2837 ext4_ext_store_pblock(ex2, newblock);
2838 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2839 ext4_ext_mark_uninitialized(ex2);
2840
2841 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2842 if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2843 err = ext4_ext_zeroout(inode, &orig_ex);
2844 if (err)
2845 goto fix_extent_len;
2846 /* update the extent length and mark as initialized */
2847 ex->ee_len = cpu_to_le16(ee_len); /* ee_len is a __le16 field */
2848 ext4_ext_try_to_merge(inode, path, ex);
2849 err = ext4_ext_dirty(handle, inode, path + depth);
2850 goto out;
2851 } else if (err)
2852 goto fix_extent_len;
2853
2854out:
2855 ext4_ext_show_leaf(inode, path);
2856 return err;
2857
2858fix_extent_len:
2859 ex->ee_len = orig_ex.ee_len;
2860 ext4_ext_dirty(handle, inode, path + depth);
2861 return err;
2862}
2863
2864/*
2865 * ext4_split_extent() splits an extent and marks the extent which is covered
2866 * by @map as @split_flag indicates.
2867 *
2868 * It may result in splitting the extent into multiple extents (up to three).
2869 * There are three possibilities:
2870 * a> There is no split required.
2871 * b> Split into two extents: the split happens at either end of the extent.
2872 * c> Split into three extents: someone is splitting in the middle of the extent.
2873 *
2874 */
2875static int ext4_split_extent(handle_t *handle,
2876 struct inode *inode,
2877 struct ext4_ext_path *path,
2878 struct ext4_map_blocks *map,
2879 int split_flag,
2880 int flags)
2881{
2882 ext4_lblk_t ee_block;
2883 struct ext4_extent *ex;
2884 unsigned int ee_len, depth;
2885 int err = 0;
2886 int uninitialized;
2887 int split_flag1, flags1;
2888
2889 depth = ext_depth(inode);
2890 ex = path[depth].p_ext;
2891 ee_block = le32_to_cpu(ex->ee_block);
2892 ee_len = ext4_ext_get_actual_len(ex);
2893 uninitialized = ext4_ext_is_uninitialized(ex);
2894
2895 if (map->m_lblk + map->m_len < ee_block + ee_len) {
2896 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2897 EXT4_EXT_MAY_ZEROOUT : 0;
2898 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
2899 if (uninitialized)
2900 split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
2901 EXT4_EXT_MARK_UNINIT2;
2902 err = ext4_split_extent_at(handle, inode, path,
2903 map->m_lblk + map->m_len, split_flag1, flags1);
2904 if (err)
2905 goto out;
2906 }
2907
2908 ext4_ext_drop_refs(path);
2909 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2910 if (IS_ERR(path))
2911 return PTR_ERR(path);
2912
2913 if (map->m_lblk >= ee_block) {
2914 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2915 EXT4_EXT_MAY_ZEROOUT : 0;
2916 if (uninitialized)
2917 split_flag1 |= EXT4_EXT_MARK_UNINIT1;
2918 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2919 split_flag1 |= EXT4_EXT_MARK_UNINIT2;
2920 err = ext4_split_extent_at(handle, inode, path,
2921 map->m_lblk, split_flag1, flags);
2922 if (err)
2923 goto out;
2924 }
2925
2926 ext4_ext_show_leaf(inode, path);
2927out:
2928 return err ? err : map->m_len;
2929}
2930
2532#define EXT4_EXT_ZERO_LEN 7 2931#define EXT4_EXT_ZERO_LEN 7
2533/* 2932/*
2534 * This function is called by ext4_ext_map_blocks() if someone tries to write 2933 * This function is called by ext4_ext_map_blocks() if someone tries to write
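A hypothetical caller of the new primitive, showing how the split_flag bits compose: split an uninitialized extent at block split, keep both halves uninitialized, and allow the ENOSPC zeroout fallback. This mirrors what ext4_split_extent() does for its first sub-split; the fragment assumes kernel context and is not compilable on its own:

        /* sketch of a caller inside fs/ext4/extents.c */
        int split_flag = EXT4_EXT_MAY_ZEROOUT |
                         EXT4_EXT_MARK_UNINIT1 |
                         EXT4_EXT_MARK_UNINIT2;

        err = ext4_split_extent_at(handle, inode, path, split,
                                   split_flag, flags | EXT4_GET_BLOCKS_PRE_IO);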
@@ -2545,17 +2944,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2545 struct ext4_map_blocks *map, 2944 struct ext4_map_blocks *map,
2546 struct ext4_ext_path *path) 2945 struct ext4_ext_path *path)
2547{ 2946{
2548 struct ext4_extent *ex, newex, orig_ex; 2947 struct ext4_map_blocks split_map;
2549 struct ext4_extent *ex1 = NULL; 2948 struct ext4_extent zero_ex;
2550 struct ext4_extent *ex2 = NULL; 2949 struct ext4_extent *ex;
2551 struct ext4_extent *ex3 = NULL;
2552 struct ext4_extent_header *eh;
2553 ext4_lblk_t ee_block, eof_block; 2950 ext4_lblk_t ee_block, eof_block;
2554 unsigned int allocated, ee_len, depth; 2951 unsigned int allocated, ee_len, depth;
2555 ext4_fsblk_t newblock;
2556 int err = 0; 2952 int err = 0;
2557 int ret = 0; 2953 int split_flag = 0;
2558 int may_zeroout;
2559 2954
2560 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" 2955 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
2561 "block %llu, max_blocks %u\n", inode->i_ino, 2956 "block %llu, max_blocks %u\n", inode->i_ino,
@@ -2567,280 +2962,86 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2567 eof_block = map->m_lblk + map->m_len; 2962 eof_block = map->m_lblk + map->m_len;
2568 2963
2569 depth = ext_depth(inode); 2964 depth = ext_depth(inode);
2570 eh = path[depth].p_hdr;
2571 ex = path[depth].p_ext; 2965 ex = path[depth].p_ext;
2572 ee_block = le32_to_cpu(ex->ee_block); 2966 ee_block = le32_to_cpu(ex->ee_block);
2573 ee_len = ext4_ext_get_actual_len(ex); 2967 ee_len = ext4_ext_get_actual_len(ex);
2574 allocated = ee_len - (map->m_lblk - ee_block); 2968 allocated = ee_len - (map->m_lblk - ee_block);
2575 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2576
2577 ex2 = ex;
2578 orig_ex.ee_block = ex->ee_block;
2579 orig_ex.ee_len = cpu_to_le16(ee_len);
2580 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2581 2969
2970 WARN_ON(map->m_lblk < ee_block);
2582 /* 2971 /*
2583 * It is safe to convert extent to initialized via explicit 2972 * It is safe to convert extent to initialized via explicit
2584 * zeroout only if extent is fully insde i_size or new_size. 2973 * zeroout only if extent is fully insde i_size or new_size.
2585 */ 2974 */
2586 may_zeroout = ee_block + ee_len <= eof_block; 2975 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
2587 2976
2588 err = ext4_ext_get_access(handle, inode, path + depth);
2589 if (err)
2590 goto out;
2591 /* If extent has less than 2*EXT4_EXT_ZERO_LEN blocks, zero out directly */ 2977 /* If extent has less than 2*EXT4_EXT_ZERO_LEN blocks, zero out directly */
2592 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { 2978 if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
2593 err = ext4_ext_zeroout(inode, &orig_ex); 2979 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2980 err = ext4_ext_zeroout(inode, ex);
2594 if (err) 2981 if (err)
2595 goto fix_extent_len;
2596 /* update the extent length and mark as initialized */
2597 ex->ee_block = orig_ex.ee_block;
2598 ex->ee_len = orig_ex.ee_len;
2599 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2600 ext4_ext_dirty(handle, inode, path + depth);
2601 /* zeroed the full extent */
2602 return allocated;
2603 }
2604
2605 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2606 if (map->m_lblk > ee_block) {
2607 ex1 = ex;
2608 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2609 ext4_ext_mark_uninitialized(ex1);
2610 ex2 = &newex;
2611 }
2612 /*
2613 * for sanity, update the length of the ex2 extent before
2614 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2615 * overlap of blocks.
2616 */
2617 if (!ex1 && allocated > map->m_len)
2618 ex2->ee_len = cpu_to_le16(map->m_len);
2619 /* ex3: to ee_block + ee_len : uninitialised */
2620 if (allocated > map->m_len) {
2621 unsigned int newdepth;
2622 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
2623 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2624 /*
2625 * map->m_lblk == ee_block is handled by the zerouout
2626 * at the beginning.
2627 * Mark first half uninitialized.
2628 * Mark second half initialized and zero out the
2629 * initialized extent
2630 */
2631 ex->ee_block = orig_ex.ee_block;
2632 ex->ee_len = cpu_to_le16(ee_len - allocated);
2633 ext4_ext_mark_uninitialized(ex);
2634 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2635 ext4_ext_dirty(handle, inode, path + depth);
2636
2637 ex3 = &newex;
2638 ex3->ee_block = cpu_to_le32(map->m_lblk);
2639 ext4_ext_store_pblock(ex3, newblock);
2640 ex3->ee_len = cpu_to_le16(allocated);
2641 err = ext4_ext_insert_extent(handle, inode, path,
2642 ex3, 0);
2643 if (err == -ENOSPC) {
2644 err = ext4_ext_zeroout(inode, &orig_ex);
2645 if (err)
2646 goto fix_extent_len;
2647 ex->ee_block = orig_ex.ee_block;
2648 ex->ee_len = orig_ex.ee_len;
2649 ext4_ext_store_pblock(ex,
2650 ext4_ext_pblock(&orig_ex));
2651 ext4_ext_dirty(handle, inode, path + depth);
2652 /* blocks available from map->m_lblk */
2653 return allocated;
2654
2655 } else if (err)
2656 goto fix_extent_len;
2657
2658 /*
2659 * We need to zero out the second half because
2660 * an fallocate request can update file size and
2661 * converting the second half to initialized extent
2662 * implies that we can leak some junk data to user
2663 * space.
2664 */
2665 err = ext4_ext_zeroout(inode, ex3);
2666 if (err) {
2667 /*
2668 * We should actually mark the
2669 * second half as uninit and return error
2670 * Insert would have changed the extent
2671 */
2672 depth = ext_depth(inode);
2673 ext4_ext_drop_refs(path);
2674 path = ext4_ext_find_extent(inode, map->m_lblk,
2675 path);
2676 if (IS_ERR(path)) {
2677 err = PTR_ERR(path);
2678 return err;
2679 }
2680 /* get the second half extent details */
2681 ex = path[depth].p_ext;
2682 err = ext4_ext_get_access(handle, inode,
2683 path + depth);
2684 if (err)
2685 return err;
2686 ext4_ext_mark_uninitialized(ex);
2687 ext4_ext_dirty(handle, inode, path + depth);
2688 return err;
2689 }
2690
2691 /* zeroed the second half */
2692 return allocated;
2693 }
2694 ex3 = &newex;
2695 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2696 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2697 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2698 ext4_ext_mark_uninitialized(ex3);
2699 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2700 if (err == -ENOSPC && may_zeroout) {
2701 err = ext4_ext_zeroout(inode, &orig_ex);
2702 if (err)
2703 goto fix_extent_len;
2704 /* update the extent length and mark as initialized */
2705 ex->ee_block = orig_ex.ee_block;
2706 ex->ee_len = orig_ex.ee_len;
2707 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2708 ext4_ext_dirty(handle, inode, path + depth);
2709 /* zeroed the full extent */
2710 /* blocks available from map->m_lblk */
2711 return allocated;
2712
2713 } else if (err)
2714 goto fix_extent_len;
2715 /*
2716 * The depth, and hence eh & ex might change
2717 * as part of the insert above.
2718 */
2719 newdepth = ext_depth(inode);
2720 /*
2721 * update the extent length after successful insert of the
2722 * split extent
2723 */
2724 ee_len -= ext4_ext_get_actual_len(ex3);
2725 orig_ex.ee_len = cpu_to_le16(ee_len);
2726 may_zeroout = ee_block + ee_len <= eof_block;
2727
2728 depth = newdepth;
2729 ext4_ext_drop_refs(path);
2730 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2731 if (IS_ERR(path)) {
2732 err = PTR_ERR(path);
2733 goto out; 2982 goto out;
2734 }
2735 eh = path[depth].p_hdr;
2736 ex = path[depth].p_ext;
2737 if (ex2 != &newex)
2738 ex2 = ex;
2739 2983
2740 err = ext4_ext_get_access(handle, inode, path + depth); 2984 err = ext4_ext_get_access(handle, inode, path + depth);
2741 if (err) 2985 if (err)
2742 goto out; 2986 goto out;
2743 2987 ext4_ext_mark_initialized(ex);
2744 allocated = map->m_len; 2988 ext4_ext_try_to_merge(inode, path, ex);
2745 2989 err = ext4_ext_dirty(handle, inode, path + depth);
2746 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2990 goto out;
2747 * to insert a extent in the middle zerout directly
2748 * otherwise give the extent a chance to merge to left
2749 */
2750 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2751 map->m_lblk != ee_block && may_zeroout) {
2752 err = ext4_ext_zeroout(inode, &orig_ex);
2753 if (err)
2754 goto fix_extent_len;
2755 /* update the extent length and mark as initialized */
2756 ex->ee_block = orig_ex.ee_block;
2757 ex->ee_len = orig_ex.ee_len;
2758 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2759 ext4_ext_dirty(handle, inode, path + depth);
2760 /* zero out the first half */
2761 /* blocks available from map->m_lblk */
2762 return allocated;
2763 }
2764 }
2765 /*
2766 * If there was a change of depth as part of the
2767 * insertion of ex3 above, we need to update the length
2768 * of the ex1 extent again here
2769 */
2770 if (ex1 && ex1 != ex) {
2771 ex1 = ex;
2772 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2773 ext4_ext_mark_uninitialized(ex1);
2774 ex2 = &newex;
2775 }
2776 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2777 ex2->ee_block = cpu_to_le32(map->m_lblk);
2778 ext4_ext_store_pblock(ex2, newblock);
2779 ex2->ee_len = cpu_to_le16(allocated);
2780 if (ex2 != ex)
2781 goto insert;
2782 /*
2783 * New (initialized) extent starts from the first block
2784 * in the current extent. i.e., ex2 == ex
2785 * We have to see if it can be merged with the extent
2786 * on the left.
2787 */
2788 if (ex2 > EXT_FIRST_EXTENT(eh)) {
2789 /*
2790 * To merge left, pass "ex2 - 1" to try_to_merge(),
2791 * since it merges towards right _only_.
2792 */
2793 ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2794 if (ret) {
2795 err = ext4_ext_correct_indexes(handle, inode, path);
2796 if (err)
2797 goto out;
2798 depth = ext_depth(inode);
2799 ex2--;
2800 }
2801 } 2991 }
2992
2802 /* 2993 /*
2803 * Try to Merge towards right. This might be required 2994 * four cases:
2804 * only when the whole extent is being written to. 2995 * 1. split the extent into three extents.
2805 * i.e. ex2 == ex and ex3 == NULL. 2996 * 2. split the extent into two extents, zero out the first half.
2997 * 3. split the extent into two extents, zero out the second half.
2998 * 4. split the extent into two extents without zeroout.
2806 */ 2999 */
2807 if (!ex3) { 3000 split_map.m_lblk = map->m_lblk;
2808 ret = ext4_ext_try_to_merge(inode, path, ex2); 3001 split_map.m_len = map->m_len;
2809 if (ret) { 3002
2810 err = ext4_ext_correct_indexes(handle, inode, path); 3003 if (allocated > map->m_len) {
3004 if (allocated <= EXT4_EXT_ZERO_LEN &&
3005 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3006 /* case 3 */
3007 zero_ex.ee_block =
3008 cpu_to_le32(map->m_lblk);
3009 zero_ex.ee_len = cpu_to_le16(allocated);
3010 ext4_ext_store_pblock(&zero_ex,
3011 ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3012 err = ext4_ext_zeroout(inode, &zero_ex);
2811 if (err) 3013 if (err)
2812 goto out; 3014 goto out;
3015 split_map.m_lblk = map->m_lblk;
3016 split_map.m_len = allocated;
3017 } else if ((map->m_lblk - ee_block + map->m_len <
3018 EXT4_EXT_ZERO_LEN) &&
3019 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3020 /* case 2 */
3021 if (map->m_lblk != ee_block) {
3022 zero_ex.ee_block = ex->ee_block;
3023 zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3024 ee_block);
3025 ext4_ext_store_pblock(&zero_ex,
3026 ext4_ext_pblock(ex));
3027 err = ext4_ext_zeroout(inode, &zero_ex);
3028 if (err)
3029 goto out;
3030 }
3031
3032 split_map.m_lblk = ee_block;
3033 split_map.m_len = map->m_lblk - ee_block + map->m_len;
3034 allocated = map->m_len;
2813 } 3035 }
2814 } 3036 }
2815 /* Mark modified extent as dirty */ 3037
2816 err = ext4_ext_dirty(handle, inode, path + depth); 3038 allocated = ext4_split_extent(handle, inode, path,
2817 goto out; 3039 &split_map, split_flag, 0);
2818insert: 3040 if (allocated < 0)
2819 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 3041 err = allocated;
2820 if (err == -ENOSPC && may_zeroout) { 3042
2821 err = ext4_ext_zeroout(inode, &orig_ex);
2822 if (err)
2823 goto fix_extent_len;
2824 /* update the extent length and mark as initialized */
2825 ex->ee_block = orig_ex.ee_block;
2826 ex->ee_len = orig_ex.ee_len;
2827 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2828 ext4_ext_dirty(handle, inode, path + depth);
2829 /* zero out the first half */
2830 return allocated;
2831 } else if (err)
2832 goto fix_extent_len;
2833out: 3043out:
2834 ext4_ext_show_leaf(inode, path);
2835 return err ? err : allocated; 3044 return err ? err : allocated;
2836
2837fix_extent_len:
2838 ex->ee_block = orig_ex.ee_block;
2839 ex->ee_len = orig_ex.ee_len;
2840 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2841 ext4_ext_mark_uninitialized(ex);
2842 ext4_ext_dirty(handle, inode, path + depth);
2843 return err;
2844} 3045}
2845 3046
2846/* 3047/*
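The rewrite above reduces ext4_ext_convert_to_initialized() to a case selection plus one ext4_split_extent() call. A rough user-space reduction of that selection, assuming EXT4_EXT_ZERO_LEN = 7 as defined just below (the real code also folds the chosen zeroout range into split_map before splitting):

#include <stdio.h>

#define ZERO_LEN 7      /* stand-in for EXT4_EXT_ZERO_LEN */

enum conv { ZERO_WHOLE, ZERO_TAIL_CASE3, ZERO_HEAD_CASE2, SPLIT_ONLY };

/* head_len stands for map->m_lblk - ee_block + map->m_len */
static enum conv pick_case(unsigned ee_len, unsigned allocated,
                           unsigned m_len, unsigned head_len, int may_zeroout)
{
        if (may_zeroout && ee_len <= 2 * ZERO_LEN)
                return ZERO_WHOLE;              /* zero the lot, no split */
        if (allocated > m_len && may_zeroout && allocated <= ZERO_LEN)
                return ZERO_TAIL_CASE3;         /* "case 3" above */
        if (allocated > m_len && may_zeroout && head_len < ZERO_LEN)
                return ZERO_HEAD_CASE2;         /* "case 2" above */
        return SPLIT_ONLY;                      /* cases 1 and 4 */
}

int main(void)
{
        /* prints 1, i.e. ZERO_TAIL_CASE3 */
        printf("%d\n", pick_case(100, 5, 3, 8, 1));
        return 0;
}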
@@ -2871,15 +3072,11 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2871 struct ext4_ext_path *path, 3072 struct ext4_ext_path *path,
2872 int flags) 3073 int flags)
2873{ 3074{
2874 struct ext4_extent *ex, newex, orig_ex; 3075 ext4_lblk_t eof_block;
2875 struct ext4_extent *ex1 = NULL; 3076 ext4_lblk_t ee_block;
2876 struct ext4_extent *ex2 = NULL; 3077 struct ext4_extent *ex;
2877 struct ext4_extent *ex3 = NULL; 3078 unsigned int ee_len;
2878 ext4_lblk_t ee_block, eof_block; 3079 int split_flag = 0, depth;
2879 unsigned int allocated, ee_len, depth;
2880 ext4_fsblk_t newblock;
2881 int err = 0;
2882 int may_zeroout;
2883 3080
2884 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 3081 ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
2885 "block %llu, max_blocks %u\n", inode->i_ino, 3082 "block %llu, max_blocks %u\n", inode->i_ino,
@@ -2889,156 +3086,22 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2889 inode->i_sb->s_blocksize_bits; 3086 inode->i_sb->s_blocksize_bits;
2890 if (eof_block < map->m_lblk + map->m_len) 3087 if (eof_block < map->m_lblk + map->m_len)
2891 eof_block = map->m_lblk + map->m_len; 3088 eof_block = map->m_lblk + map->m_len;
2892
2893 depth = ext_depth(inode);
2894 ex = path[depth].p_ext;
2895 ee_block = le32_to_cpu(ex->ee_block);
2896 ee_len = ext4_ext_get_actual_len(ex);
2897 allocated = ee_len - (map->m_lblk - ee_block);
2898 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2899
2900 ex2 = ex;
2901 orig_ex.ee_block = ex->ee_block;
2902 orig_ex.ee_len = cpu_to_le16(ee_len);
2903 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2904
2905 /* 3089 /*
2906 * It is safe to convert extent to initialized via explicit 3090 * It is safe to convert extent to initialized via explicit
2907 * zeroout only if extent is fully inside i_size or new_size. 3091 * zeroout only if extent is fully inside i_size or new_size.
2908 */ 3092 */
2909 may_zeroout = ee_block + ee_len <= eof_block; 3093 depth = ext_depth(inode);
2910 3094 ex = path[depth].p_ext;
2911 /* 3095 ee_block = le32_to_cpu(ex->ee_block);
2912 * If the uninitialized extent begins at the same logical 3096 ee_len = ext4_ext_get_actual_len(ex);
2913 * block where the write begins, and the write completely
2914 * covers the extent, then we don't need to split it.
2915 */
2916 if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2917 return allocated;
2918
2919 err = ext4_ext_get_access(handle, inode, path + depth);
2920 if (err)
2921 goto out;
2922 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2923 if (map->m_lblk > ee_block) {
2924 ex1 = ex;
2925 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2926 ext4_ext_mark_uninitialized(ex1);
2927 ex2 = &newex;
2928 }
2929 /*
2930 * for sanity, update the length of the ex2 extent before
2931 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2932 * overlap of blocks.
2933 */
2934 if (!ex1 && allocated > map->m_len)
2935 ex2->ee_len = cpu_to_le16(map->m_len);
2936 /* ex3: to ee_block + ee_len : uninitialised */
2937 if (allocated > map->m_len) {
2938 unsigned int newdepth;
2939 ex3 = &newex;
2940 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2941 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2942 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2943 ext4_ext_mark_uninitialized(ex3);
2944 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2945 if (err == -ENOSPC && may_zeroout) {
2946 err = ext4_ext_zeroout(inode, &orig_ex);
2947 if (err)
2948 goto fix_extent_len;
2949 /* update the extent length and mark as initialized */
2950 ex->ee_block = orig_ex.ee_block;
2951 ex->ee_len = orig_ex.ee_len;
2952 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2953 ext4_ext_dirty(handle, inode, path + depth);
2954 /* zeroed the full extent */
2955 /* blocks available from map->m_lblk */
2956 return allocated;
2957
2958 } else if (err)
2959 goto fix_extent_len;
2960 /*
2961 * The depth, and hence eh & ex might change
2962 * as part of the insert above.
2963 */
2964 newdepth = ext_depth(inode);
2965 /*
2966 * update the extent length after successful insert of the
2967 * split extent
2968 */
2969 ee_len -= ext4_ext_get_actual_len(ex3);
2970 orig_ex.ee_len = cpu_to_le16(ee_len);
2971 may_zeroout = ee_block + ee_len <= eof_block;
2972
2973 depth = newdepth;
2974 ext4_ext_drop_refs(path);
2975 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2976 if (IS_ERR(path)) {
2977 err = PTR_ERR(path);
2978 goto out;
2979 }
2980 ex = path[depth].p_ext;
2981 if (ex2 != &newex)
2982 ex2 = ex;
2983 3097
2984 err = ext4_ext_get_access(handle, inode, path + depth); 3098 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
2985 if (err) 3099 split_flag |= EXT4_EXT_MARK_UNINIT2;
2986 goto out;
2987 3100
2988 allocated = map->m_len; 3101 flags |= EXT4_GET_BLOCKS_PRE_IO;
2989 } 3102 return ext4_split_extent(handle, inode, path, map, split_flag, flags);
2990 /*
2991 * If there was a change of depth as part of the
2992 * insertion of ex3 above, we need to update the length
2993 * of the ex1 extent again here
2994 */
2995 if (ex1 && ex1 != ex) {
2996 ex1 = ex;
2997 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2998 ext4_ext_mark_uninitialized(ex1);
2999 ex2 = &newex;
3000 }
3001 /*
3002 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3003 * using direct I/O, uninitialised still.
3004 */
3005 ex2->ee_block = cpu_to_le32(map->m_lblk);
3006 ext4_ext_store_pblock(ex2, newblock);
3007 ex2->ee_len = cpu_to_le16(allocated);
3008 ext4_ext_mark_uninitialized(ex2);
3009 if (ex2 != ex)
3010 goto insert;
3011 /* Mark modified extent as dirty */
3012 err = ext4_ext_dirty(handle, inode, path + depth);
3013 ext_debug("out here\n");
3014 goto out;
3015insert:
3016 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3017 if (err == -ENOSPC && may_zeroout) {
3018 err = ext4_ext_zeroout(inode, &orig_ex);
3019 if (err)
3020 goto fix_extent_len;
3021 /* update the extent length and mark as initialized */
3022 ex->ee_block = orig_ex.ee_block;
3023 ex->ee_len = orig_ex.ee_len;
3024 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3025 ext4_ext_dirty(handle, inode, path + depth);
3026 /* zero out the first half */
3027 return allocated;
3028 } else if (err)
3029 goto fix_extent_len;
3030out:
3031 ext4_ext_show_leaf(inode, path);
3032 return err ? err : allocated;
3033
3034fix_extent_len:
3035 ex->ee_block = orig_ex.ee_block;
3036 ex->ee_len = orig_ex.ee_len;
3037 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3038 ext4_ext_mark_uninitialized(ex);
3039 ext4_ext_dirty(handle, inode, path + depth);
3040 return err;
3041} 3103}
3104
3042static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3105static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3043 struct inode *inode, 3106 struct inode *inode,
3044 struct ext4_ext_path *path) 3107 struct ext4_ext_path *path)
@@ -3047,46 +3110,27 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3047 struct ext4_extent_header *eh; 3110 struct ext4_extent_header *eh;
3048 int depth; 3111 int depth;
3049 int err = 0; 3112 int err = 0;
3050 int ret = 0;
3051 3113
3052 depth = ext_depth(inode); 3114 depth = ext_depth(inode);
3053 eh = path[depth].p_hdr; 3115 eh = path[depth].p_hdr;
3054 ex = path[depth].p_ext; 3116 ex = path[depth].p_ext;
3055 3117
3118 ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
3119 "block %llu, max_blocks %u\n", inode->i_ino,
3120 (unsigned long long)le32_to_cpu(ex->ee_block),
3121 ext4_ext_get_actual_len(ex));
3122
3056 err = ext4_ext_get_access(handle, inode, path + depth); 3123 err = ext4_ext_get_access(handle, inode, path + depth);
3057 if (err) 3124 if (err)
3058 goto out; 3125 goto out;
3059 /* first mark the extent as initialized */ 3126 /* first mark the extent as initialized */
3060 ext4_ext_mark_initialized(ex); 3127 ext4_ext_mark_initialized(ex);
3061 3128
3062 /* 3129 /* note: ext4_ext_correct_indexes() isn't needed here because
3063 * We have to see if it can be merged with the extent 3130 * borders are not changed
3064 * on the left.
3065 */
3066 if (ex > EXT_FIRST_EXTENT(eh)) {
3067 /*
3068 * To merge left, pass "ex - 1" to try_to_merge(),
3069 * since it merges towards right _only_.
3070 */
3071 ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3072 if (ret) {
3073 err = ext4_ext_correct_indexes(handle, inode, path);
3074 if (err)
3075 goto out;
3076 depth = ext_depth(inode);
3077 ex--;
3078 }
3079 }
3080 /*
3081 * Try to Merge towards right.
3082 */ 3131 */
3083 ret = ext4_ext_try_to_merge(inode, path, ex); 3132 ext4_ext_try_to_merge(inode, path, ex);
3084 if (ret) { 3133
3085 err = ext4_ext_correct_indexes(handle, inode, path);
3086 if (err)
3087 goto out;
3088 depth = ext_depth(inode);
3089 }
3090 /* Mark modified extent as dirty */ 3134 /* Mark modified extent as dirty */
3091 err = ext4_ext_dirty(handle, inode, path + depth); 3135 err = ext4_ext_dirty(handle, inode, path + depth);
3092out: 3136out:
@@ -3302,15 +3346,19 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3302 ext4_fsblk_t newblock = 0; 3346 ext4_fsblk_t newblock = 0;
3303 int err = 0, depth, ret; 3347 int err = 0, depth, ret;
3304 unsigned int allocated = 0; 3348 unsigned int allocated = 0;
3349 unsigned int punched_out = 0;
3350 unsigned int result = 0;
3305 struct ext4_allocation_request ar; 3351 struct ext4_allocation_request ar;
3306 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3352 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3353 struct ext4_map_blocks punch_map;
3307 3354
3308 ext_debug("blocks %u/%u requested for inode %lu\n", 3355 ext_debug("blocks %u/%u requested for inode %lu\n",
3309 map->m_lblk, map->m_len, inode->i_ino); 3356 map->m_lblk, map->m_len, inode->i_ino);
3310 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 3357 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3311 3358
3312 /* check in cache */ 3359 /* check in cache */
3313 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { 3360 if (ext4_ext_in_cache(inode, map->m_lblk, &newex) &&
3361 ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) {
3314 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3362 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3315 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3363 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3316 /* 3364 /*
@@ -3375,16 +3423,84 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3375 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 3423 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3376 ee_block, ee_len, newblock); 3424 ee_block, ee_len, newblock);
3377 3425
3378 /* Do not put uninitialized extent in the cache */ 3426 if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
3379 if (!ext4_ext_is_uninitialized(ex)) { 3427 /*
3380 ext4_ext_put_in_cache(inode, ee_block, 3428 * Do not put uninitialized extent
3381 ee_len, ee_start); 3429 * in the cache
3382 goto out; 3430 */
3431 if (!ext4_ext_is_uninitialized(ex)) {
3432 ext4_ext_put_in_cache(inode, ee_block,
3433 ee_len, ee_start);
3434 goto out;
3435 }
3436 ret = ext4_ext_handle_uninitialized_extents(
3437 handle, inode, map, path, flags,
3438 allocated, newblock);
3439 return ret;
3383 } 3440 }
3384 ret = ext4_ext_handle_uninitialized_extents(handle, 3441
3385 inode, map, path, flags, allocated, 3442 /*
3386 newblock); 3443 * Punch out the map length, but only to the
3387 return ret; 3444 * end of the extent
3445 */
3446 punched_out = allocated < map->m_len ?
3447 allocated : map->m_len;
3448
3449 /*
3450 * Since extents need to be converted to
3451 * uninitialized, they must fit in an
3452 * uninitialized extent
3453 */
3454 if (punched_out > EXT_UNINIT_MAX_LEN)
3455 punched_out = EXT_UNINIT_MAX_LEN;
3456
3457 punch_map.m_lblk = map->m_lblk;
3458 punch_map.m_pblk = newblock;
3459 punch_map.m_len = punched_out;
3460 punch_map.m_flags = 0;
3461
3462 /* Check to see if the extent needs to be split */
3463 if (punch_map.m_len != ee_len ||
3464 punch_map.m_lblk != ee_block) {
3465
3466 ret = ext4_split_extent(handle, inode,
3467 path, &punch_map, 0,
3468 EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
3469 EXT4_GET_BLOCKS_PRE_IO);
3470
3471 if (ret < 0) {
3472 err = ret;
3473 goto out2;
3474 }
3475 /*
3476 * find extent for the block at
3477 * the start of the hole
3478 */
3479 ext4_ext_drop_refs(path);
3480 kfree(path);
3481
3482 path = ext4_ext_find_extent(inode,
3483 map->m_lblk, NULL);
3484 if (IS_ERR(path)) {
3485 err = PTR_ERR(path);
3486 path = NULL;
3487 goto out2;
3488 }
3489
3490 depth = ext_depth(inode);
3491 ex = path[depth].p_ext;
3492 ee_len = ext4_ext_get_actual_len(ex);
3493 ee_block = le32_to_cpu(ex->ee_block);
3494 ee_start = ext4_ext_pblock(ex);
3495
3496 }
3497
3498 ext4_ext_mark_uninitialized(ex);
3499
3500 err = ext4_ext_remove_space(inode, map->m_lblk,
3501 map->m_lblk + punched_out);
3502
3503 goto out2;
3388 } 3504 }
3389 } 3505 }
3390 3506
@@ -3446,6 +3562,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3446 else 3562 else
3447 /* disable in-core preallocation for non-regular files */ 3563 /* disable in-core preallocation for non-regular files */
3448 ar.flags = 0; 3564 ar.flags = 0;
3565 if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
3566 ar.flags |= EXT4_MB_HINT_NOPREALLOC;
3449 newblock = ext4_mb_new_blocks(handle, &ar, &err); 3567 newblock = ext4_mb_new_blocks(handle, &ar, &err);
3450 if (!newblock) 3568 if (!newblock)
3451 goto out2; 3569 goto out2;
@@ -3529,7 +3647,11 @@ out2:
3529 } 3647 }
3530 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, 3648 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
3531 newblock, map->m_len, err ? err : allocated); 3649 newblock, map->m_len, err ? err : allocated);
3532 return err ? err : allocated; 3650
3651 result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
3652 punched_out : allocated;
3653
3654 return err ? err : result;
3533} 3655}
3534 3656
3535void ext4_ext_truncate(struct inode *inode) 3657void ext4_ext_truncate(struct inode *inode)
@@ -3577,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode)
3577 3699
3578 last_block = (inode->i_size + sb->s_blocksize - 1) 3700 last_block = (inode->i_size + sb->s_blocksize - 1)
3579 >> EXT4_BLOCK_SIZE_BITS(sb); 3701 >> EXT4_BLOCK_SIZE_BITS(sb);
3580 err = ext4_ext_remove_space(inode, last_block); 3702 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK);
3581 3703
3582 /* In a multi-transaction truncate, we only make the final 3704 /* In a multi-transaction truncate, we only make the final
3583 * transaction synchronous. 3705 * transaction synchronous.
@@ -3585,8 +3707,9 @@ void ext4_ext_truncate(struct inode *inode)
3585 if (IS_SYNC(inode)) 3707 if (IS_SYNC(inode))
3586 ext4_handle_sync(handle); 3708 ext4_handle_sync(handle);
3587 3709
3588out_stop:
3589 up_write(&EXT4_I(inode)->i_data_sem); 3710 up_write(&EXT4_I(inode)->i_data_sem);
3711
3712out_stop:
3590 /* 3713 /*
3591 * If this was a simple ftruncate() and the file will remain alive, 3714 * If this was a simple ftruncate() and the file will remain alive,
3592 * then we need to clear up the orphan record which we created above. 3715 * then we need to clear up the orphan record which we created above.
@@ -3651,10 +3774,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3651 struct ext4_map_blocks map; 3774 struct ext4_map_blocks map;
3652 unsigned int credits, blkbits = inode->i_blkbits; 3775 unsigned int credits, blkbits = inode->i_blkbits;
3653 3776
3654 /* We only support the FALLOC_FL_KEEP_SIZE mode */
3655 if (mode & ~FALLOC_FL_KEEP_SIZE)
3656 return -EOPNOTSUPP;
3657
3658 /* 3777 /*
3659 * currently supporting (pre)allocate mode for extent-based 3778 * currently supporting (pre)allocate mode for extent-based
3660 * files _only_ 3779 * files _only_
@@ -3662,6 +3781,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3662 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3781 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3663 return -EOPNOTSUPP; 3782 return -EOPNOTSUPP;
3664 3783
3784 /* Return error if mode is not supported */
3785 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
3786 return -EOPNOTSUPP;
3787
3788 if (mode & FALLOC_FL_PUNCH_HOLE)
3789 return ext4_punch_hole(file, offset, len);
3790
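/*
 * Editor's sketch, not part of the patch: with the dispatch above in
 * place, hole punching becomes reachable from userspace through the
 * ordinary fallocate(2) call. A minimal illustration -- the path and
 * offsets are invented, and FALLOC_FL_KEEP_SIZE is ORed in as the
 * fallocate API expects for punch requests:
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int punch_demo(void)
{
	int fd = open("/mnt/ext4/testfile", O_WRONLY);

	if (fd < 0)
		return -1;
	/* Deallocate 1 MiB at offset 4 MiB; i_size stays unchanged. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4 << 20, 1 << 20) < 0)
		perror("fallocate");	/* e.g. not an extent-based file */
	close(fd);
	return 0;
}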
3665 trace_ext4_fallocate_enter(inode, offset, len, mode); 3791 trace_ext4_fallocate_enter(inode, offset, len, mode);
3666 map.m_lblk = offset >> blkbits; 3792 map.m_lblk = offset >> blkbits;
3667 /* 3793 /*
@@ -3691,7 +3817,8 @@ retry:
3691 break; 3817 break;
3692 } 3818 }
3693 ret = ext4_map_blocks(handle, inode, &map, 3819 ret = ext4_map_blocks(handle, inode, &map,
3694 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3820 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
3821 EXT4_GET_BLOCKS_NO_NORMALIZE);
3695 if (ret <= 0) { 3822 if (ret <= 0) {
3696#ifdef EXT4FS_DEBUG 3823#ifdef EXT4FS_DEBUG
3697 WARN_ON(ret <= 0); 3824 WARN_ON(ret <= 0);
@@ -3822,6 +3949,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3822 pgoff_t last_offset; 3949 pgoff_t last_offset;
3823 pgoff_t offset; 3950 pgoff_t offset;
3824 pgoff_t index; 3951 pgoff_t index;
3952 pgoff_t start_index = 0;
3825 struct page **pages = NULL; 3953 struct page **pages = NULL;
3826 struct buffer_head *bh = NULL; 3954 struct buffer_head *bh = NULL;
3827 struct buffer_head *head = NULL; 3955 struct buffer_head *head = NULL;
@@ -3848,39 +3976,57 @@ out:
3848 kfree(pages); 3976 kfree(pages);
3849 return EXT_CONTINUE; 3977 return EXT_CONTINUE;
3850 } 3978 }
3979 index = 0;
3851 3980
3981next_page:
3852 /* Try to find the 1st mapped buffer. */ 3982 /* Try to find the 1st mapped buffer. */
3853 end = ((__u64)pages[0]->index << PAGE_SHIFT) >> 3983 end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
3854 blksize_bits; 3984 blksize_bits;
3855 if (!page_has_buffers(pages[0])) 3985 if (!page_has_buffers(pages[index]))
3856 goto out; 3986 goto out;
3857 head = page_buffers(pages[0]); 3987 head = page_buffers(pages[index]);
3858 if (!head) 3988 if (!head)
3859 goto out; 3989 goto out;
3860 3990
3991 index++;
3861 bh = head; 3992 bh = head;
3862 do { 3993 do {
3863 if (buffer_mapped(bh)) { 3994 if (end >= newex->ec_block +
3995 newex->ec_len)
3996 /* The buffer is out of
3997 * the request range.
3998 */
3999 goto out;
4000
4001 if (buffer_mapped(bh) &&
4002 end >= newex->ec_block) {
4003 start_index = index - 1;
3864 /* get the 1st mapped buffer. */ 4004 /* get the 1st mapped buffer. */
3865 if (end > newex->ec_block +
3866 newex->ec_len)
3867 /* The buffer is out of
3868 * the request range.
3869 */
3870 goto out;
3871 goto found_mapped_buffer; 4005 goto found_mapped_buffer;
3872 } 4006 }
4007
3873 bh = bh->b_this_page; 4008 bh = bh->b_this_page;
3874 end++; 4009 end++;
3875 } while (bh != head); 4010 } while (bh != head);
3876 4011
3877 /* No mapped buffer found. */ 4012 /* No mapped buffer in the range was found
3878 goto out; 4013 * in this page; look at the next page.
 4014 */
4015 if (index >= ret) {
4016 /* There is no page left, but we need to limit
4017 * newex->ec_len.
4018 */
4019 newex->ec_len = end - newex->ec_block;
4020 goto out;
4021 }
4022 goto next_page;
3879 } else { 4023 } else {
3880 /* Find contiguous delayed buffers. */ 4024 /* Find contiguous delayed buffers. */
3881 if (ret > 0 && pages[0]->index == last_offset) 4025 if (ret > 0 && pages[0]->index == last_offset)
3882 head = page_buffers(pages[0]); 4026 head = page_buffers(pages[0]);
3883 bh = head; 4027 bh = head;
4028 index = 1;
4029 start_index = 0;
3884 } 4030 }
3885 4031
3886found_mapped_buffer: 4032found_mapped_buffer:
@@ -3903,7 +4049,7 @@ found_mapped_buffer:
3903 end++; 4049 end++;
3904 } while (bh != head); 4050 } while (bh != head);
3905 4051
3906 for (index = 1; index < ret; index++) { 4052 for (; index < ret; index++) {
3907 if (!page_has_buffers(pages[index])) { 4053 if (!page_has_buffers(pages[index])) {
3908 bh = NULL; 4054 bh = NULL;
3909 break; 4055 break;
@@ -3913,8 +4059,10 @@ found_mapped_buffer:
3913 bh = NULL; 4059 bh = NULL;
3914 break; 4060 break;
3915 } 4061 }
4062
3916 if (pages[index]->index != 4063 if (pages[index]->index !=
3917 pages[0]->index + index) { 4064 pages[start_index]->index + index
4065 - start_index) {
3918 /* Blocks are not contiguous. */ 4066 /* Blocks are not contiguous. */
3919 bh = NULL; 4067 bh = NULL;
3920 break; 4068 break;
@@ -4006,6 +4154,177 @@ static int ext4_xattr_fiemap(struct inode *inode,
4006 return (error < 0 ? error : 0); 4154 return (error < 0 ? error : 0);
4007} 4155}
4008 4156
4157/*
4158 * ext4_ext_punch_hole
4159 *
4160 * Punches a hole of "length" bytes in a file starting
4161 * at byte "offset"
4162 *
4163 * @file: The file to punch a hole in
4164 * @offset: The starting byte offset of the hole
4165 * @length: The length of the hole
4166 *
4167 * Returns the number of blocks removed or negative on error
4168 */
4169int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4170{
4171 struct inode *inode = file->f_path.dentry->d_inode;
4172 struct super_block *sb = inode->i_sb;
4173 struct ext4_ext_cache cache_ex;
4174 ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
4175 struct address_space *mapping = inode->i_mapping;
4176 struct ext4_map_blocks map;
4177 handle_t *handle;
4178 loff_t first_block_offset, last_block_offset, block_len;
4179 loff_t first_page, last_page, first_page_offset, last_page_offset;
4180 int ret, credits, blocks_released, err = 0;
4181
4182 first_block = (offset + sb->s_blocksize - 1) >>
4183 EXT4_BLOCK_SIZE_BITS(sb);
4184 last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4185
4186 first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
4187 last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
4188
4189 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4190 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4191
4192 first_page_offset = first_page << PAGE_CACHE_SHIFT;
4193 last_page_offset = last_page << PAGE_CACHE_SHIFT;
4194
4195 /*
4196 * Write out all dirty pages to avoid race conditions,
4197 * then release them.
4198 */
4199 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4200 err = filemap_write_and_wait_range(mapping,
4201 first_page_offset == 0 ? 0 : first_page_offset-1,
4202 last_page_offset);
4203
4204 if (err)
4205 return err;
4206 }
4207
4208 /* Now release the pages */
4209 if (last_page_offset > first_page_offset) {
4210 truncate_inode_pages_range(mapping, first_page_offset,
4211 last_page_offset-1);
4212 }
4213
4214 /* finish any pending end_io work */
4215 ext4_flush_completed_IO(inode);
4216
4217 credits = ext4_writepage_trans_blocks(inode);
4218 handle = ext4_journal_start(inode, credits);
4219 if (IS_ERR(handle))
4220 return PTR_ERR(handle);
4221
4222 err = ext4_orphan_add(handle, inode);
4223 if (err)
4224 goto out;
4225
4226 /*
4227 * Now we need to zero out the non-block-aligned data.
4228 * If the hole lies within a single block, just
4229 * zero out the middle of that block
4230 */
4231 if (first_block > last_block)
4232 ext4_block_zero_page_range(handle, mapping, offset, length);
4233 else {
4234 /* zero out the head of the hole before the first block */
4235 block_len = first_block_offset - offset;
4236 if (block_len > 0)
4237 ext4_block_zero_page_range(handle, mapping,
4238 offset, block_len);
4239
4240 /* zero out the tail of the hole after the last block */
4241 block_len = offset + length - last_block_offset;
4242 if (block_len > 0) {
4243 ext4_block_zero_page_range(handle, mapping,
4244 last_block_offset, block_len);
4245 }
4246 }
4247
4248 /* If there are no blocks to remove, return now */
4249 if (first_block >= last_block)
4250 goto out;
4251
4252 down_write(&EXT4_I(inode)->i_data_sem);
4253 ext4_ext_invalidate_cache(inode);
4254 ext4_discard_preallocations(inode);
4255
4256 /*
4257 * Loop over all the blocks and identify blocks
4258 * that need to be punched out
4259 */
4260 iblock = first_block;
4261 blocks_released = 0;
4262 while (iblock < last_block) {
4263 max_blocks = last_block - iblock;
4264 num_blocks = 1;
4265 memset(&map, 0, sizeof(map));
4266 map.m_lblk = iblock;
4267 map.m_len = max_blocks;
4268 ret = ext4_ext_map_blocks(handle, inode, &map,
4269 EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
4270
4271 if (ret > 0) {
4272 blocks_released += ret;
4273 num_blocks = ret;
4274 } else if (ret == 0) {
4275 /*
4276 * If map blocks could not find the block,
4277 * then it is in a hole. If the hole was
4278 * not already cached, map blocks should have
4279 * put it in the cache, so we can read the
4280 * hole's extent out of the cache
4281 */
4282 memset(&cache_ex, 0, sizeof(cache_ex));
4283 if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
4284 !cache_ex.ec_start) {
4285
4286 /* The hole is cached */
4287 num_blocks = cache_ex.ec_block +
4288 cache_ex.ec_len - iblock;
4289
4290 } else {
4291 /* The block could not be identified */
4292 err = -EIO;
4293 break;
4294 }
4295 } else {
4296 /* Map blocks error */
4297 err = ret;
4298 break;
4299 }
4300
4301 if (num_blocks == 0) {
4302 /* This condition should never happen */
4303 ext_debug("Block lookup failed");
4304 err = -EIO;
4305 break;
4306 }
4307
4308 iblock += num_blocks;
4309 }
4310
4311 if (blocks_released > 0) {
4312 ext4_ext_invalidate_cache(inode);
4313 ext4_discard_preallocations(inode);
4314 }
4315
4316 if (IS_SYNC(inode))
4317 ext4_handle_sync(handle);
4318
4319 up_write(&EXT4_I(inode)->i_data_sem);
4320
4321out:
4322 ext4_orphan_del(handle, inode);
4323 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4324 ext4_mark_inode_dirty(handle, inode);
4325 ext4_journal_stop(handle);
4326 return err;
4327}
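/*
 * Editor's worked example for the alignment logic above (assuming a
 * 4 KiB block size): punching offset = 5000, length = 10000 gives
 *
 *	first_block = (5000 + 4095) >> 12 = 2	 first full block in the hole
 *	last_block  = (5000 + 10000) >> 12 = 3	 first block past the hole
 *
 * so only block 2 is actually deallocated, while the partial head
 * [5000, 8192) and tail [12288, 15000) are zeroed in place by
 * ext4_block_zero_page_range() instead of being punched out.
 */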
4009int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4328int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4010 __u64 start, __u64 len) 4329 __u64 start, __u64 len)
4011{ 4330{
@@ -4042,4 +4361,3 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4042 4361
4043 return error; 4362 return error;
4044} 4363}
4045
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 7b80d543b89e..2c0972322009 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -272,7 +272,6 @@ const struct file_operations ext4_file_operations = {
272}; 272};
273 273
274const struct inode_operations ext4_file_inode_operations = { 274const struct inode_operations ext4_file_inode_operations = {
275 .truncate = ext4_truncate,
276 .setattr = ext4_setattr, 275 .setattr = ext4_setattr,
277 .getattr = ext4_getattr, 276 .getattr = ext4_getattr,
278#ifdef CONFIG_EXT4_FS_XATTR 277#ifdef CONFIG_EXT4_FS_XATTR
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e9473cbe80df..ce66d2fe826c 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -36,7 +36,7 @@
36 36
37static void dump_completed_IO(struct inode * inode) 37static void dump_completed_IO(struct inode * inode)
38{ 38{
39#ifdef EXT4_DEBUG 39#ifdef EXT4FS_DEBUG
40 struct list_head *cur, *before, *after; 40 struct list_head *cur, *before, *after;
41 ext4_io_end_t *io, *io0, *io1; 41 ext4_io_end_t *io, *io0, *io1;
42 unsigned long flags; 42 unsigned long flags;
@@ -172,6 +172,7 @@ int ext4_sync_file(struct file *file, int datasync)
172 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 172 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
173 int ret; 173 int ret;
174 tid_t commit_tid; 174 tid_t commit_tid;
175 bool needs_barrier = false;
175 176
176 J_ASSERT(ext4_journal_current_handle() == NULL); 177 J_ASSERT(ext4_journal_current_handle() == NULL);
177 178
@@ -211,22 +212,12 @@ int ext4_sync_file(struct file *file, int datasync)
211 } 212 }
212 213
213 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; 214 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
214 if (jbd2_log_start_commit(journal, commit_tid)) { 215 if (journal->j_flags & JBD2_BARRIER &&
215 /* 216 !jbd2_trans_will_send_data_barrier(journal, commit_tid))
216 * When the journal is on a different device than the 217 needs_barrier = true;
217 * fs data disk, we need to issue the barrier in 218 jbd2_log_start_commit(journal, commit_tid);
218 * writeback mode. (In ordered mode, the jbd2 layer 219 ret = jbd2_log_wait_commit(journal, commit_tid);
219 * will take care of issuing the barrier. In 220 if (needs_barrier)
220 * data=journal, all of the data blocks are written to
221 * the journal device.)
222 */
223 if (ext4_should_writeback_data(inode) &&
224 (journal->j_fs_dev != journal->j_dev) &&
225 (journal->j_flags & JBD2_BARRIER))
226 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
227 NULL);
228 ret = jbd2_log_wait_commit(journal, commit_tid);
229 } else if (journal->j_flags & JBD2_BARRIER)
230 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 221 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
231 out: 222 out:
232 trace_ext4_sync_file_exit(inode, ret); 223 trace_ext4_sync_file_exit(inode, ret);
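/*
 * Editor's sketch (an assumption mirroring the hunk above, not code from
 * the patch): the flush is now issued from fsync only when jbd2 itself
 * will not send a barrier for this commit, replacing the old guesswork
 * based on journal mode and device topology. The helper name is invented:
 */
static bool fsync_needs_own_flush(journal_t *journal, tid_t commit_tid)
{
	/* jbd2 flushes the disk cache itself when the commit carries a
	 * barrier; issuing another one here would only cost performance. */
	return (journal->j_flags & JBD2_BARRIER) &&
	       !jbd2_trans_will_send_data_barrier(journal, commit_tid);
}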
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f2fa5e8a582c..50d0e9c64584 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -639,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
639 while (target > 0) { 639 while (target > 0) {
640 count = target; 640 count = target;
641 /* allocating blocks for indirect blocks and direct blocks */ 641 /* allocating blocks for indirect blocks and direct blocks */
642 current_block = ext4_new_meta_blocks(handle, inode, 642 current_block = ext4_new_meta_blocks(handle, inode, goal,
643 goal, &count, err); 643 0, &count, err);
644 if (*err) 644 if (*err)
645 goto failed_out; 645 goto failed_out;
646 646
@@ -1930,7 +1930,7 @@ repeat:
1930 * We do still charge estimated metadata to the sb though; 1930 * We do still charge estimated metadata to the sb though;
1931 * we cannot afford to run out of free blocks. 1931 * we cannot afford to run out of free blocks.
1932 */ 1932 */
1933 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
1934 dquot_release_reservation_block(inode, 1); 1934 dquot_release_reservation_block(inode, 1);
1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1936 yield(); 1936 yield();
@@ -2796,9 +2796,7 @@ static int write_cache_pages_da(struct address_space *mapping,
2796 continue; 2796 continue;
2797 } 2797 }
2798 2798
2799 if (PageWriteback(page)) 2799 wait_on_page_writeback(page);
2800 wait_on_page_writeback(page);
2801
2802 BUG_ON(PageWriteback(page)); 2800 BUG_ON(PageWriteback(page));
2803 2801
2804 if (mpd->next_page != page->index) 2802 if (mpd->next_page != page->index)
@@ -3513,7 +3511,7 @@ retry:
3513 loff_t end = offset + iov_length(iov, nr_segs); 3511 loff_t end = offset + iov_length(iov, nr_segs);
3514 3512
3515 if (end > isize) 3513 if (end > isize)
3516 vmtruncate(inode, isize); 3514 ext4_truncate_failed_write(inode);
3517 } 3515 }
3518 } 3516 }
3519 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3517 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3916,9 +3914,30 @@ void ext4_set_aops(struct inode *inode)
3916int ext4_block_truncate_page(handle_t *handle, 3914int ext4_block_truncate_page(handle_t *handle,
3917 struct address_space *mapping, loff_t from) 3915 struct address_space *mapping, loff_t from)
3918{ 3916{
3917 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3918 unsigned length;
3919 unsigned blocksize;
3920 struct inode *inode = mapping->host;
3921
3922 blocksize = inode->i_sb->s_blocksize;
3923 length = blocksize - (offset & (blocksize - 1));
3924
3925 return ext4_block_zero_page_range(handle, mapping, from, length);
3926}
3927
3928/*
3929 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3930 * starting from file offset 'from'. The range to be zeroed must
3931 * be contained within one block. If the specified range exceeds
3932 * the end of the block it will be shortened to the end of the block
3933 * that corresponds to 'from'
3934 */
3935int ext4_block_zero_page_range(handle_t *handle,
3936 struct address_space *mapping, loff_t from, loff_t length)
3937{
3919 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3938 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3920 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3939 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3921 unsigned blocksize, length, pos; 3940 unsigned blocksize, max, pos;
3922 ext4_lblk_t iblock; 3941 ext4_lblk_t iblock;
3923 struct inode *inode = mapping->host; 3942 struct inode *inode = mapping->host;
3924 struct buffer_head *bh; 3943 struct buffer_head *bh;
@@ -3931,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle,
3931 return -EINVAL; 3950 return -EINVAL;
3932 3951
3933 blocksize = inode->i_sb->s_blocksize; 3952 blocksize = inode->i_sb->s_blocksize;
3934 length = blocksize - (offset & (blocksize - 1)); 3953 max = blocksize - (offset & (blocksize - 1));
3954
3955 /*
3956 * clamp 'length' so the zeroed range stays between
3957 * 'from' and the end of the block
3958 */
3959 if (length > max || length < 0)
3960 length = max;
3961
3935 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3962 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3936 3963
3937 if (!page_has_buffers(page)) 3964 if (!page_has_buffers(page))
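/*
 * Editor's worked example for the wrapper above (assuming a 1 KiB block
 * size and 4 KiB pages): for from = 5300,
 *
 *	offset = 5300 & 4095 = 1204
 *	length = 1024 - (1204 & 1023) = 1024 - 180 = 844
 *
 * so ext4_block_truncate_page() zeroes bytes [5300, 6144), i.e. from
 * 'from' to the end of its block -- exactly the tail a truncate to 5300
 * must clear -- while ext4_block_zero_page_range() lets punch hole pass
 * any shorter in-block length explicitly.
 */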
@@ -4380,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4380 4407
4381int ext4_can_truncate(struct inode *inode) 4408int ext4_can_truncate(struct inode *inode)
4382{ 4409{
4383 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4384 return 0;
4385 if (S_ISREG(inode->i_mode)) 4410 if (S_ISREG(inode->i_mode))
4386 return 1; 4411 return 1;
4387 if (S_ISDIR(inode->i_mode)) 4412 if (S_ISDIR(inode->i_mode))
@@ -4392,6 +4417,31 @@ int ext4_can_truncate(struct inode *inode)
4392} 4417}
4393 4418
4394/* 4419/*
4420 * ext4_punch_hole: punches a hole in a file by releasing the blocks
4421 * associated with the given offset and length
4422 *
4423 * @file: The file to punch a hole in
4424 * @offset: The offset where the hole will begin
4425 * @length: The length of the hole
4426 *
4427 * Returns: 0 on success or negative on failure
4428 */
4429
4430int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
4431{
4432 struct inode *inode = file->f_path.dentry->d_inode;
4433 if (!S_ISREG(inode->i_mode))
4434 return -EOPNOTSUPP;
4435
4436 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4437 /* TODO: Add support for non-extent hole punching */
4438 return -EOPNOTSUPP;
4439 }
4440
4441 return ext4_ext_punch_hole(file, offset, length);
4442}
4443
4444/*
4395 * ext4_truncate() 4445 * ext4_truncate()
4396 * 4446 *
4397 * We block out ext4_get_block() block instantiations across the entire 4447 * We block out ext4_get_block() block instantiations across the entire
@@ -4617,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
4617 /* 4667 /*
4618 * Figure out the offset within the block group inode table 4668 * Figure out the offset within the block group inode table
4619 */ 4669 */
4620 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); 4670 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4621 inode_offset = ((inode->i_ino - 1) % 4671 inode_offset = ((inode->i_ino - 1) %
4622 EXT4_INODES_PER_GROUP(sb)); 4672 EXT4_INODES_PER_GROUP(sb));
4623 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4673 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
@@ -5311,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5311 5361
5312 if (S_ISREG(inode->i_mode) && 5362 if (S_ISREG(inode->i_mode) &&
5313 attr->ia_valid & ATTR_SIZE && 5363 attr->ia_valid & ATTR_SIZE &&
5314 (attr->ia_size < inode->i_size || 5364 (attr->ia_size < inode->i_size)) {
5315 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5316 handle_t *handle; 5365 handle_t *handle;
5317 5366
5318 handle = ext4_journal_start(inode, 3); 5367 handle = ext4_journal_start(inode, 3);
@@ -5346,14 +5395,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5346 goto err_out; 5395 goto err_out;
5347 } 5396 }
5348 } 5397 }
5349 /* ext4_truncate will clear the flag */
5350 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5351 ext4_truncate(inode);
5352 } 5398 }
5353 5399
5354 if ((attr->ia_valid & ATTR_SIZE) && 5400 if (attr->ia_valid & ATTR_SIZE) {
5355 attr->ia_size != i_size_read(inode)) 5401 if (attr->ia_size != i_size_read(inode)) {
5356 rc = vmtruncate(inode, attr->ia_size); 5402 truncate_setsize(inode, attr->ia_size);
5403 ext4_truncate(inode);
5404 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
5405 ext4_truncate(inode);
5406 }
5357 5407
5358 if (!rc) { 5408 if (!rc) {
5359 setattr_copy(inode, attr); 5409 setattr_copy(inode, attr);
@@ -5811,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5811 goto out_unlock; 5861 goto out_unlock;
5812 } 5862 }
5813 ret = 0; 5863 ret = 0;
5814 if (PageMappedToDisk(page)) 5864
5815 goto out_unlock; 5865 lock_page(page);
5866 wait_on_page_writeback(page);
5867 if (PageMappedToDisk(page)) {
5868 up_read(&inode->i_alloc_sem);
5869 return VM_FAULT_LOCKED;
5870 }
5816 5871
5817 if (page->index == size >> PAGE_CACHE_SHIFT) 5872 if (page->index == size >> PAGE_CACHE_SHIFT)
5818 len = size & ~PAGE_CACHE_MASK; 5873 len = size & ~PAGE_CACHE_MASK;
5819 else 5874 else
5820 len = PAGE_CACHE_SIZE; 5875 len = PAGE_CACHE_SIZE;
5821 5876
5822 lock_page(page);
5823 /* 5877 /*
5824 * return if we have all the buffers mapped. This avoids 5878
5825 * the need to call write_begin/write_end which does a 5879 * the need to call write_begin/write_end which does a
@@ -5829,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5829 if (page_has_buffers(page)) { 5883 if (page_has_buffers(page)) {
5830 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5884 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5831 ext4_bh_unmapped)) { 5885 ext4_bh_unmapped)) {
5832 unlock_page(page); 5886 up_read(&inode->i_alloc_sem);
5833 goto out_unlock; 5887 return VM_FAULT_LOCKED;
5834 } 5888 }
5835 } 5889 }
5836 unlock_page(page); 5890 unlock_page(page);
@@ -5850,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5850 if (ret < 0) 5904 if (ret < 0)
5851 goto out_unlock; 5905 goto out_unlock;
5852 ret = 0; 5906 ret = 0;
5907
5908 /*
5909 * write_begin/end might have created a dirty page and someone
5910 * could wander in and start the IO. Make sure that hasn't
5911 * happened.
5912 */
5913 lock_page(page);
5914 wait_on_page_writeback(page);
5915 up_read(&inode->i_alloc_sem);
5916 return VM_FAULT_LOCKED;
5853out_unlock: 5917out_unlock:
5854 if (ret) 5918 if (ret)
5855 ret = VM_FAULT_SIGBUS; 5919 ret = VM_FAULT_SIGBUS;
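/*
 * Editor's sketch (assumption): the rewritten returns above all converge
 * on one pattern -- hand the page back still locked, so writeback cannot
 * start between dirtying the page and the fault handler installing the
 * pte. The helper name is invented for illustration:
 */
static int mkwrite_return_stable(struct page *page, struct inode *inode)
{
	lock_page(page);
	wait_on_page_writeback(page);	/* no IO in flight on this page */
	up_read(&inode->i_alloc_sem);
	return VM_FAULT_LOCKED;		/* the fault path unlocks the page */
}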
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d8a16eecf1d5..859f2ae8864e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -787,6 +787,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
787 struct inode *inode; 787 struct inode *inode;
788 char *data; 788 char *data;
789 char *bitmap; 789 char *bitmap;
790 struct ext4_group_info *grinfo;
790 791
791 mb_debug(1, "init page %lu\n", page->index); 792 mb_debug(1, "init page %lu\n", page->index);
792 793
@@ -819,6 +820,18 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
819 if (first_group + i >= ngroups) 820 if (first_group + i >= ngroups)
820 break; 821 break;
821 822
823 grinfo = ext4_get_group_info(sb, first_group + i);
824 /*
825 * If page is uptodate then we came here after online resize
826 * which added some new uninitialized group info structs, so
827 * we must skip all initialized uptodate buddies on the page,
828 * which may be currently in use by an allocating task.
829 */
830 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
831 bh[i] = NULL;
832 continue;
833 }
834
822 err = -EIO; 835 err = -EIO;
823 desc = ext4_get_group_desc(sb, first_group + i, NULL); 836 desc = ext4_get_group_desc(sb, first_group + i, NULL);
824 if (desc == NULL) 837 if (desc == NULL)
@@ -871,26 +884,28 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
871 } 884 }
872 885
873 /* wait for I/O completion */ 886 /* wait for I/O completion */
874 for (i = 0; i < groups_per_page && bh[i]; i++) 887 for (i = 0; i < groups_per_page; i++)
875 wait_on_buffer(bh[i]); 888 if (bh[i])
889 wait_on_buffer(bh[i]);
876 890
877 err = -EIO; 891 err = -EIO;
878 for (i = 0; i < groups_per_page && bh[i]; i++) 892 for (i = 0; i < groups_per_page; i++)
879 if (!buffer_uptodate(bh[i])) 893 if (bh[i] && !buffer_uptodate(bh[i]))
880 goto out; 894 goto out;
881 895
882 err = 0; 896 err = 0;
883 first_block = page->index * blocks_per_page; 897 first_block = page->index * blocks_per_page;
884 /* init the page */
885 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
886 for (i = 0; i < blocks_per_page; i++) { 898 for (i = 0; i < blocks_per_page; i++) {
887 int group; 899 int group;
888 struct ext4_group_info *grinfo;
889 900
890 group = (first_block + i) >> 1; 901 group = (first_block + i) >> 1;
891 if (group >= ngroups) 902 if (group >= ngroups)
892 break; 903 break;
893 904
905 if (!bh[group - first_group])
906 /* skip initialized uptodate buddy */
907 continue;
908
894 /* 909 /*
895 * data carry information regarding this 910 * data carry information regarding this
896 * particular group in the format specified 911 * particular group in the format specified
@@ -919,6 +934,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
919 * incore got set to the group block bitmap below 934 * incore got set to the group block bitmap below
920 */ 935 */
921 ext4_lock_group(sb, group); 936 ext4_lock_group(sb, group);
937 /* init the buddy */
938 memset(data, 0xff, blocksize);
922 ext4_mb_generate_buddy(sb, data, incore, group); 939 ext4_mb_generate_buddy(sb, data, incore, group);
923 ext4_unlock_group(sb, group); 940 ext4_unlock_group(sb, group);
924 incore = NULL; 941 incore = NULL;
@@ -948,7 +965,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
948 965
949out: 966out:
950 if (bh) { 967 if (bh) {
951 for (i = 0; i < groups_per_page && bh[i]; i++) 968 for (i = 0; i < groups_per_page; i++)
952 brelse(bh[i]); 969 brelse(bh[i]);
953 if (bh != &bhs) 970 if (bh != &bhs)
954 kfree(bh); 971 kfree(bh);
@@ -957,22 +974,21 @@ out:
957} 974}
958 975
959/* 976/*
960 * lock the group_info alloc_sem of all the groups 977 * Lock the buddy and bitmap pages. This makes sure a parallel init_group
961 * belonging to the same buddy cache page. This 978 * on the same buddy page cannot run while we hold the buddy page lock.
962 * make sure other parallel operation on the buddy 979 * Return the locked buddy and bitmap pages in the e4b struct. If buddy and
963 * cache doesn't happen whild holding the buddy cache 980 * bitmap are on the same page, e4b->bd_buddy_page is NULL and 0 is returned.
964 * lock
965 */ 981 */
966static int ext4_mb_get_buddy_cache_lock(struct super_block *sb, 982static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
967 ext4_group_t group) 983 ext4_group_t group, struct ext4_buddy *e4b)
968{ 984{
969 int i; 985 struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
970 int block, pnum; 986 int block, pnum, poff;
971 int blocks_per_page; 987 int blocks_per_page;
972 int groups_per_page; 988 struct page *page;
973 ext4_group_t ngroups = ext4_get_groups_count(sb); 989
974 ext4_group_t first_group; 990 e4b->bd_buddy_page = NULL;
975 struct ext4_group_info *grp; 991 e4b->bd_bitmap_page = NULL;
976 992
977 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 993 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
978 /* 994 /*
@@ -982,57 +998,40 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
982 */ 998 */
983 block = group * 2; 999 block = group * 2;
984 pnum = block / blocks_per_page; 1000 pnum = block / blocks_per_page;
985 first_group = pnum * blocks_per_page / 2; 1001 poff = block % blocks_per_page;
986 1002 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
987 groups_per_page = blocks_per_page >> 1; 1003 if (!page)
988 if (groups_per_page == 0) 1004 return -EIO;
989 groups_per_page = 1; 1005 BUG_ON(page->mapping != inode->i_mapping);
990 /* read all groups the page covers into the cache */ 1006 e4b->bd_bitmap_page = page;
991 for (i = 0; i < groups_per_page; i++) { 1007 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
992 1008
993 if ((first_group + i) >= ngroups) 1009 if (blocks_per_page >= 2) {
994 break; 1010 /* buddy and bitmap are on the same page */
995 grp = ext4_get_group_info(sb, first_group + i); 1011 return 0;
996 /* take all groups write allocation
997 * semaphore. This make sure there is
998 * no block allocation going on in any
999 * of that groups
1000 */
1001 down_write_nested(&grp->alloc_sem, i);
1002 } 1012 }
1003 return i; 1013
1014 block++;
1015 pnum = block / blocks_per_page;
1016 poff = block % blocks_per_page;
1017 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1018 if (!page)
1019 return -EIO;
1020 BUG_ON(page->mapping != inode->i_mapping);
1021 e4b->bd_buddy_page = page;
1022 return 0;
1004} 1023}
1005 1024
1006static void ext4_mb_put_buddy_cache_lock(struct super_block *sb, 1025static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1007 ext4_group_t group, int locked_group)
1008{ 1026{
1009 int i; 1027 if (e4b->bd_bitmap_page) {
1010 int block, pnum; 1028 unlock_page(e4b->bd_bitmap_page);
1011 int blocks_per_page; 1029 page_cache_release(e4b->bd_bitmap_page);
1012 ext4_group_t first_group; 1030 }
1013 struct ext4_group_info *grp; 1031 if (e4b->bd_buddy_page) {
1014 1032 unlock_page(e4b->bd_buddy_page);
1015 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1033 page_cache_release(e4b->bd_buddy_page);
1016 /*
1017 * the buddy cache inode stores the block bitmap
1018 * and buddy information in consecutive blocks.
1019 * So for each group we need two blocks.
1020 */
1021 block = group * 2;
1022 pnum = block / blocks_per_page;
1023 first_group = pnum * blocks_per_page / 2;
1024 /* release locks on all the groups */
1025 for (i = 0; i < locked_group; i++) {
1026
1027 grp = ext4_get_group_info(sb, first_group + i);
1028 /* take all groups write allocation
1029 * semaphore. This make sure there is
1030 * no block allocation going on in any
1031 * of that groups
1032 */
1033 up_write(&grp->alloc_sem);
1034 } 1034 }
1035
1036} 1035}
1037 1036
1038/* 1037/*
@@ -1044,93 +1043,60 @@ static noinline_for_stack
1044int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1043int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1045{ 1044{
1046 1045
1047 int ret = 0;
1048 void *bitmap;
1049 int blocks_per_page;
1050 int block, pnum, poff;
1051 int num_grp_locked = 0;
1052 struct ext4_group_info *this_grp; 1046 struct ext4_group_info *this_grp;
1053 struct ext4_sb_info *sbi = EXT4_SB(sb); 1047 struct ext4_buddy e4b;
1054 struct inode *inode = sbi->s_buddy_cache; 1048 struct page *page;
1055 struct page *page = NULL, *bitmap_page = NULL; 1049 int ret = 0;
1056 1050
1057 mb_debug(1, "init group %u\n", group); 1051 mb_debug(1, "init group %u\n", group);
1058 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1059 this_grp = ext4_get_group_info(sb, group); 1052 this_grp = ext4_get_group_info(sb, group);
1060 /* 1053 /*
1061 * This ensures that we don't reinit the buddy cache 1054 * This ensures that we don't reinit the buddy cache
1062 * page which maps to the group from which we are already 1055 * page which maps to the group from which we are already
1063 * allocating. If we are looking at the buddy cache we would 1056 * allocating. If we are looking at the buddy cache we would
1064 * have taken a reference using ext4_mb_load_buddy and that 1057 * have taken a reference using ext4_mb_load_buddy and that
1065 * would have taken the alloc_sem lock. 1058 * would have pinned the buddy page in the page cache.
1066 */ 1059 */
1067 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); 1060 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
1068 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { 1061 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1069 /* 1062 /*
1070 * somebody initialized the group 1063 * somebody initialized the group
1071 * return without doing anything 1064 * return without doing anything
1072 */ 1065 */
1073 ret = 0;
1074 goto err; 1066 goto err;
1075 } 1067 }
1076 /* 1068
1077 * the buddy cache inode stores the block bitmap 1069 page = e4b.bd_bitmap_page;
1078 * and buddy information in consecutive blocks. 1070 ret = ext4_mb_init_cache(page, NULL);
1079 * So for each group we need two blocks. 1071 if (ret)
1080 */ 1072 goto err;
1081 block = group * 2; 1073 if (!PageUptodate(page)) {
1082 pnum = block / blocks_per_page;
1083 poff = block % blocks_per_page;
1084 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1085 if (page) {
1086 BUG_ON(page->mapping != inode->i_mapping);
1087 ret = ext4_mb_init_cache(page, NULL);
1088 if (ret) {
1089 unlock_page(page);
1090 goto err;
1091 }
1092 unlock_page(page);
1093 }
1094 if (page == NULL || !PageUptodate(page)) {
1095 ret = -EIO; 1074 ret = -EIO;
1096 goto err; 1075 goto err;
1097 } 1076 }
1098 mark_page_accessed(page); 1077 mark_page_accessed(page);
1099 bitmap_page = page;
1100 bitmap = page_address(page) + (poff * sb->s_blocksize);
1101 1078
1102 /* init buddy cache */ 1079 if (e4b.bd_buddy_page == NULL) {
1103 block++;
1104 pnum = block / blocks_per_page;
1105 poff = block % blocks_per_page;
1106 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1107 if (page == bitmap_page) {
1108 /* 1080 /*
1109 * If both the bitmap and buddy are in 1081 * If both the bitmap and buddy are in
1110 * the same page we don't need to force 1082 * the same page we don't need to force
1111 * init the buddy 1083 * init the buddy
1112 */ 1084 */
1113 unlock_page(page); 1085 ret = 0;
1114 } else if (page) { 1086 goto err;
1115 BUG_ON(page->mapping != inode->i_mapping);
1116 ret = ext4_mb_init_cache(page, bitmap);
1117 if (ret) {
1118 unlock_page(page);
1119 goto err;
1120 }
1121 unlock_page(page);
1122 } 1087 }
1123 if (page == NULL || !PageUptodate(page)) { 1088 /* init buddy cache */
1089 page = e4b.bd_buddy_page;
1090 ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
1091 if (ret)
1092 goto err;
1093 if (!PageUptodate(page)) {
1124 ret = -EIO; 1094 ret = -EIO;
1125 goto err; 1095 goto err;
1126 } 1096 }
1127 mark_page_accessed(page); 1097 mark_page_accessed(page);
1128err: 1098err:
1129 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); 1099 ext4_mb_put_buddy_page_lock(&e4b);
1130 if (bitmap_page)
1131 page_cache_release(bitmap_page);
1132 if (page)
1133 page_cache_release(page);
1134 return ret; 1100 return ret;
1135} 1101}
1136 1102
@@ -1164,24 +1130,8 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1164 e4b->bd_group = group; 1130 e4b->bd_group = group;
1165 e4b->bd_buddy_page = NULL; 1131 e4b->bd_buddy_page = NULL;
1166 e4b->bd_bitmap_page = NULL; 1132 e4b->bd_bitmap_page = NULL;
1167 e4b->alloc_semp = &grp->alloc_sem;
1168
1169 /* Take the read lock on the group alloc
1170 * sem. This would make sure a parallel
1171 * ext4_mb_init_group happening on other
1172 * groups mapped by the page is blocked
1173 * till we are done with allocation
1174 */
1175repeat_load_buddy:
1176 down_read(e4b->alloc_semp);
1177 1133
1178 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1134 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1179 /* we need to check for group need init flag
1180 * with alloc_semp held so that we can be sure
1181 * that new blocks didn't get added to the group
1182 * when we are loading the buddy cache
1183 */
1184 up_read(e4b->alloc_semp);
1185 /* 1135 /*
1186 * we need full data about the group 1136 * we need full data about the group
1187 * to make a good selection 1137 * to make a good selection
@@ -1189,7 +1139,6 @@ repeat_load_buddy:
1189 ret = ext4_mb_init_group(sb, group); 1139 ret = ext4_mb_init_group(sb, group);
1190 if (ret) 1140 if (ret)
1191 return ret; 1141 return ret;
1192 goto repeat_load_buddy;
1193 } 1142 }
1194 1143
1195 /* 1144 /*
@@ -1273,15 +1222,14 @@ repeat_load_buddy:
1273 return 0; 1222 return 0;
1274 1223
1275err: 1224err:
1225 if (page)
1226 page_cache_release(page);
1276 if (e4b->bd_bitmap_page) 1227 if (e4b->bd_bitmap_page)
1277 page_cache_release(e4b->bd_bitmap_page); 1228 page_cache_release(e4b->bd_bitmap_page);
1278 if (e4b->bd_buddy_page) 1229 if (e4b->bd_buddy_page)
1279 page_cache_release(e4b->bd_buddy_page); 1230 page_cache_release(e4b->bd_buddy_page);
1280 e4b->bd_buddy = NULL; 1231 e4b->bd_buddy = NULL;
1281 e4b->bd_bitmap = NULL; 1232 e4b->bd_bitmap = NULL;
1282
1283 /* Done with the buddy cache */
1284 up_read(e4b->alloc_semp);
1285 return ret; 1233 return ret;
1286} 1234}
1287 1235
@@ -1291,9 +1239,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1291 page_cache_release(e4b->bd_bitmap_page); 1239 page_cache_release(e4b->bd_bitmap_page);
1292 if (e4b->bd_buddy_page) 1240 if (e4b->bd_buddy_page)
1293 page_cache_release(e4b->bd_buddy_page); 1241 page_cache_release(e4b->bd_buddy_page);
1294 /* Done with the buddy cache */
1295 if (e4b->alloc_semp)
1296 up_read(e4b->alloc_semp);
1297} 1242}
1298 1243
1299 1244
@@ -1606,9 +1551,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1606 get_page(ac->ac_bitmap_page); 1551 get_page(ac->ac_bitmap_page);
1607 ac->ac_buddy_page = e4b->bd_buddy_page; 1552 ac->ac_buddy_page = e4b->bd_buddy_page;
1608 get_page(ac->ac_buddy_page); 1553 get_page(ac->ac_buddy_page);
1609 /* on allocation we use ac to track the held semaphore */
1610 ac->alloc_semp = e4b->alloc_semp;
1611 e4b->alloc_semp = NULL;
1612 /* store last allocated for subsequent stream allocation */ 1554 /* store last allocated for subsequent stream allocation */
1613 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 1555 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1614 spin_lock(&sbi->s_md_lock); 1556 spin_lock(&sbi->s_md_lock);
@@ -2659,7 +2601,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2659 struct super_block *sb = journal->j_private; 2601 struct super_block *sb = journal->j_private;
2660 struct ext4_buddy e4b; 2602 struct ext4_buddy e4b;
2661 struct ext4_group_info *db; 2603 struct ext4_group_info *db;
2662 int err, ret, count = 0, count2 = 0; 2604 int err, count = 0, count2 = 0;
2663 struct ext4_free_data *entry; 2605 struct ext4_free_data *entry;
2664 struct list_head *l, *ltmp; 2606 struct list_head *l, *ltmp;
2665 2607
@@ -2669,15 +2611,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2669 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2611 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2670 entry->count, entry->group, entry); 2612 entry->count, entry->group, entry);
2671 2613
2672 if (test_opt(sb, DISCARD)) { 2614 if (test_opt(sb, DISCARD))
2673 ret = ext4_issue_discard(sb, entry->group, 2615 ext4_issue_discard(sb, entry->group,
2674 entry->start_blk, entry->count); 2616 entry->start_blk, entry->count);
2675 if (unlikely(ret == -EOPNOTSUPP)) {
2676 ext4_warning(sb, "discard not supported, "
2677 "disabling");
2678 clear_opt(sb, DISCARD);
2679 }
2680 }
2681 2617
2682 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2618 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2683 /* we expect to find existing buddy because it's pinned */ 2619 /* we expect to find existing buddy because it's pinned */
@@ -4226,15 +4162,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4226 spin_unlock(&pa->pa_lock); 4162 spin_unlock(&pa->pa_lock);
4227 } 4163 }
4228 } 4164 }
4229 if (ac->alloc_semp)
4230 up_read(ac->alloc_semp);
4231 if (pa) { 4165 if (pa) {
4232 /* 4166 /*
4233 * We want to add the pa to the right bucket. 4167 * We want to add the pa to the right bucket.
4234 * Remove it from the list and while adding 4168 * Remove it from the list and while adding
4235 * make sure the list to which we are adding 4169 * make sure the list to which we are adding
4236 * doesn't grow big. We need to release 4170 * doesn't grow big.
4237 * alloc_semp before calling ext4_mb_add_n_trim()
4238 */ 4171 */
4239 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { 4172 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4240 spin_lock(pa->pa_obj_lock); 4173 spin_lock(pa->pa_obj_lock);
@@ -4303,7 +4236,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4303 * there is enough free blocks to do block allocation 4236 * there is enough free blocks to do block allocation
4304 * and verify allocation doesn't exceed the quota limits. 4237 * and verify allocation doesn't exceed the quota limits.
4305 */ 4238 */
4306 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { 4239 while (ar->len &&
4240 ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
4241
4307 /* let others to free the space */ 4242 /* let others to free the space */
4308 yield(); 4243 yield();
4309 ar->len = ar->len >> 1; 4244 ar->len = ar->len >> 1;
@@ -4313,9 +4248,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4313 return 0; 4248 return 0;
4314 } 4249 }
4315 reserv_blks = ar->len; 4250 reserv_blks = ar->len;
4316 while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { 4251 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4317 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4252 dquot_alloc_block_nofail(ar->inode, ar->len);
4318 ar->len--; 4253 } else {
4254 while (ar->len &&
4255 dquot_alloc_block(ar->inode, ar->len)) {
4256
4257 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4258 ar->len--;
4259 }
4319 } 4260 }
4320 inquota = ar->len; 4261 inquota = ar->len;
4321 if (ar->len == 0) { 4262 if (ar->len == 0) {
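/*
 * Editor's note (a reading of the hunk above, not text from the patch):
 * EXT4_MB_USE_ROOT_BLOCKS marks allocations that must not be shrunk or
 * failed on quota pressure -- ext4_claim_free_blocks() is now passed
 * ar->flags so such requests may dip into the reserved pool, and the
 * quota charge goes through dquot_alloc_block_nofail() instead of the
 * usual retry-and-halve loop.
 */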
@@ -4704,6 +4645,127 @@ error_return:
4704} 4645}
4705 4646
4706/** 4647/**
4648 * ext4_add_groupblocks() -- Add given blocks to an existing group
4649 * @handle: handle to this transaction
4650 * @sb: super block
4651 * @block: start physical block to add to the block group
4652 * @count: number of blocks to add
4653 *
4654 * This marks the blocks as free in the bitmap and buddy.
4655 */
4656void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4657 ext4_fsblk_t block, unsigned long count)
4658{
4659 struct buffer_head *bitmap_bh = NULL;
4660 struct buffer_head *gd_bh;
4661 ext4_group_t block_group;
4662 ext4_grpblk_t bit;
4663 unsigned int i;
4664 struct ext4_group_desc *desc;
4665 struct ext4_sb_info *sbi = EXT4_SB(sb);
4666 struct ext4_buddy e4b;
4667 int err = 0, ret, blk_free_count;
4668 ext4_grpblk_t blocks_freed;
4669 struct ext4_group_info *grp;
4670
4671 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4672
4673 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4674 grp = ext4_get_group_info(sb, block_group);
4675 /*
4676 * Check to see if we are freeing blocks across a group
4677 * boundary.
4678 */
4679 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
4680 goto error_return;
4681
4682 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4683 if (!bitmap_bh)
4684 goto error_return;
4685 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4686 if (!desc)
4687 goto error_return;
4688
4689 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
4690 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
4691 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
4692 in_range(block + count - 1, ext4_inode_table(sb, desc),
4693 sbi->s_itb_per_group)) {
4694 ext4_error(sb, "Adding blocks in system zones - "
4695 "Block = %llu, count = %lu",
4696 block, count);
4697 goto error_return;
4698 }
4699
4700 BUFFER_TRACE(bitmap_bh, "getting write access");
4701 err = ext4_journal_get_write_access(handle, bitmap_bh);
4702 if (err)
4703 goto error_return;
4704
4705 /*
4706 * We are about to modify some metadata. Call the journal APIs
4707 * to unshare ->b_data if a currently-committing transaction is
4708 * using it
4709 */
4710 BUFFER_TRACE(gd_bh, "get_write_access");
4711 err = ext4_journal_get_write_access(handle, gd_bh);
4712 if (err)
4713 goto error_return;
4714
4715 for (i = 0, blocks_freed = 0; i < count; i++) {
4716 BUFFER_TRACE(bitmap_bh, "clear bit");
4717 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
4718 ext4_error(sb, "bit already cleared for block %llu",
4719 (ext4_fsblk_t)(block + i));
4720 BUFFER_TRACE(bitmap_bh, "bit already cleared");
4721 } else {
4722 blocks_freed++;
4723 }
4724 }
4725
4726 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4727 if (err)
4728 goto error_return;
4729
4730 /*
4731 * need to update group_info->bb_free and the bitmap
4732 * with the group lock held; generate_buddy looks at
4733 * them with the group lock held
4734 */
4735 ext4_lock_group(sb, block_group);
4736 mb_clear_bits(bitmap_bh->b_data, bit, count);
4737 mb_free_blocks(NULL, &e4b, bit, count);
4738 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
4739 ext4_free_blks_set(sb, desc, blk_free_count);
4740 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
4741 ext4_unlock_group(sb, block_group);
4742 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
4743
4744 if (sbi->s_log_groups_per_flex) {
4745 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4746 atomic_add(blocks_freed,
4747 &sbi->s_flex_groups[flex_group].free_blocks);
4748 }
4749
4750 ext4_mb_unload_buddy(&e4b);
4751
4752 /* We dirtied the bitmap block */
4753 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4754 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4755
4756 /* And the group descriptor block */
4757 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4758 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4759 if (!err)
4760 err = ret;
4761
4762error_return:
4763 brelse(bitmap_bh);
4764 ext4_std_error(sb, err);
4765 return;
4766}
4767
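/*
 * Editor's note (assumption, based on the function's shape rather than
 * the patch text): this is the sort of helper an online resize wants --
 * once the superblock's block count has grown, the new tail of the last
 * group can be handed back to mballoc in one call, e.g.:
 *
 *	ext4_add_groupblocks(handle, sb, o_blocks_count, add);
 *
 * where o_blocks_count is the old filesystem size in blocks and add is
 * the number of appended blocks (both names invented here). mballoc then
 * clears the bits in the bitmap and buddy under the group lock.
 */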
4768/**
4707 * ext4_trim_extent -- function to TRIM one single free extent in the group 4769 * ext4_trim_extent -- function to TRIM one single free extent in the group
4708 * @sb: super block for the file system 4770 * @sb: super block for the file system
4709 * @start: starting block of the free extent in the alloc. group 4771 * @start: starting block of the free extent in the alloc. group
@@ -4715,11 +4777,10 @@ error_return:
4715 * one will allocate those blocks, mark it as used in buddy bitmap. This must 4777 * one will allocate those blocks, mark it as used in buddy bitmap. This must
4716 * be called under the group lock. 4778
4717 */ 4779 */
4718static int ext4_trim_extent(struct super_block *sb, int start, int count, 4780static void ext4_trim_extent(struct super_block *sb, int start, int count,
4719 ext4_group_t group, struct ext4_buddy *e4b) 4781 ext4_group_t group, struct ext4_buddy *e4b)
4720{ 4782{
4721 struct ext4_free_extent ex; 4783 struct ext4_free_extent ex;
4722 int ret = 0;
4723 4784
4724 assert_spin_locked(ext4_group_lock_ptr(sb, group)); 4785 assert_spin_locked(ext4_group_lock_ptr(sb, group));
4725 4786
@@ -4733,12 +4794,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
4733 */ 4794 */
4734 mb_mark_used(e4b, &ex); 4795 mb_mark_used(e4b, &ex);
4735 ext4_unlock_group(sb, group); 4796 ext4_unlock_group(sb, group);
4736 4797 ext4_issue_discard(sb, group, start, count);
4737 ret = ext4_issue_discard(sb, group, start, count);
4738
4739 ext4_lock_group(sb, group); 4798 ext4_lock_group(sb, group);
4740 mb_free_blocks(NULL, e4b, start, ex.fe_len); 4799 mb_free_blocks(NULL, e4b, start, ex.fe_len);
4741 return ret;
4742} 4800}
4743 4801
4744/** 4802/**
@@ -4760,21 +4818,26 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
4760 * the group buddy bitmap. This is done until whole group is scanned. 4818 * the group buddy bitmap. This is done until whole group is scanned.
4761 */ 4819 */
4762static ext4_grpblk_t 4820static ext4_grpblk_t
4763ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, 4821ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4764 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) 4822 ext4_grpblk_t start, ext4_grpblk_t max,
4823 ext4_grpblk_t minblocks)
4765{ 4824{
4766 void *bitmap; 4825 void *bitmap;
4767 ext4_grpblk_t next, count = 0; 4826 ext4_grpblk_t next, count = 0;
4768 ext4_group_t group; 4827 struct ext4_buddy e4b;
4769 int ret = 0; 4828 int ret;
4770 4829
4771 BUG_ON(e4b == NULL); 4830 ret = ext4_mb_load_buddy(sb, group, &e4b);
4831 if (ret) {
4832 ext4_error(sb, "Error in loading buddy "
4833 "information for %u", group);
4834 return ret;
4835 }
4836 bitmap = e4b.bd_bitmap;
4772 4837
4773 bitmap = e4b->bd_bitmap;
4774 group = e4b->bd_group;
4775 start = (e4b->bd_info->bb_first_free > start) ?
4776 e4b->bd_info->bb_first_free : start;
4777 ext4_lock_group(sb, group); 4838 ext4_lock_group(sb, group);
4839 start = (e4b.bd_info->bb_first_free > start) ?
4840 e4b.bd_info->bb_first_free : start;
4778 4841
4779 while (start < max) { 4842 while (start < max) {
4780 start = mb_find_next_zero_bit(bitmap, max, start); 4843 start = mb_find_next_zero_bit(bitmap, max, start);
@@ -4783,10 +4846,8 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4783 next = mb_find_next_bit(bitmap, max, start); 4846 next = mb_find_next_bit(bitmap, max, start);
4784 4847
4785 if ((next - start) >= minblocks) { 4848 if ((next - start) >= minblocks) {
4786 ret = ext4_trim_extent(sb, start, 4849 ext4_trim_extent(sb, start,
4787 next - start, group, e4b); 4850 next - start, group, &e4b);
4788 if (ret < 0)
4789 break;
4790 count += next - start; 4851 count += next - start;
4791 } 4852 }
4792 start = next + 1; 4853 start = next + 1;
@@ -4802,17 +4863,15 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4802 ext4_lock_group(sb, group); 4863 ext4_lock_group(sb, group);
4803 } 4864 }
4804 4865
4805 if ((e4b->bd_info->bb_free - count) < minblocks) 4866 if ((e4b.bd_info->bb_free - count) < minblocks)
4806 break; 4867 break;
4807 } 4868 }
4808 ext4_unlock_group(sb, group); 4869 ext4_unlock_group(sb, group);
4870 ext4_mb_unload_buddy(&e4b);
4809 4871
4810 ext4_debug("trimmed %d blocks in the group %d\n", 4872 ext4_debug("trimmed %d blocks in the group %d\n",
4811 count, group); 4873 count, group);
4812 4874
4813 if (ret < 0)
4814 count = ret;
4815
4816 return count; 4875 return count;
4817} 4876}
4818 4877
@@ -4830,11 +4889,11 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4830 */ 4889 */
4831int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) 4890int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4832{ 4891{
4833 struct ext4_buddy e4b; 4892 struct ext4_group_info *grp;
4834 ext4_group_t first_group, last_group; 4893 ext4_group_t first_group, last_group;
4835 ext4_group_t group, ngroups = ext4_get_groups_count(sb); 4894 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4836 ext4_grpblk_t cnt = 0, first_block, last_block; 4895 ext4_grpblk_t cnt = 0, first_block, last_block;
4837 uint64_t start, len, minlen, trimmed; 4896 uint64_t start, len, minlen, trimmed = 0;
4838 ext4_fsblk_t first_data_blk = 4897 ext4_fsblk_t first_data_blk =
4839 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 4898 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4840 int ret = 0; 4899 int ret = 0;
@@ -4842,7 +4901,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4842 start = range->start >> sb->s_blocksize_bits; 4901 start = range->start >> sb->s_blocksize_bits;
4843 len = range->len >> sb->s_blocksize_bits; 4902 len = range->len >> sb->s_blocksize_bits;
4844 minlen = range->minlen >> sb->s_blocksize_bits; 4903 minlen = range->minlen >> sb->s_blocksize_bits;
4845 trimmed = 0;
4846 4904
4847 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) 4905 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4848 return -EINVAL; 4906 return -EINVAL;
@@ -4863,11 +4921,12 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4863 return -EINVAL; 4921 return -EINVAL;
4864 4922
4865 for (group = first_group; group <= last_group; group++) { 4923 for (group = first_group; group <= last_group; group++) {
4866 ret = ext4_mb_load_buddy(sb, group, &e4b); 4924 grp = ext4_get_group_info(sb, group);
4867 if (ret) { 4925 /* We only do this if the grp has never been initialized */
4868 ext4_error(sb, "Error in loading buddy " 4926 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
4869 "information for %u", group); 4927 ret = ext4_mb_init_group(sb, group);
4870 break; 4928 if (ret)
4929 break;
4871 } 4930 }
4872 4931
4873 /* 4932 /*
@@ -4880,16 +4939,14 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4880 last_block = first_block + len; 4939 last_block = first_block + len;
4881 len -= last_block - first_block; 4940 len -= last_block - first_block;
4882 4941
4883 if (e4b.bd_info->bb_free >= minlen) { 4942 if (grp->bb_free >= minlen) {
4884 cnt = ext4_trim_all_free(sb, &e4b, first_block, 4943 cnt = ext4_trim_all_free(sb, group, first_block,
4885 last_block, minlen); 4944 last_block, minlen);
4886 if (cnt < 0) { 4945 if (cnt < 0) {
4887 ret = cnt; 4946 ret = cnt;
4888 ext4_mb_unload_buddy(&e4b);
4889 break; 4947 break;
4890 } 4948 }
4891 } 4949 }
4892 ext4_mb_unload_buddy(&e4b);
4893 trimmed += cnt; 4950 trimmed += cnt;
4894 first_block = 0; 4951 first_block = 0;
4895 } 4952 }
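/*
 * Editor's sketch, not part of the patch: ext4_trim_fs() is driven by
 * the FITRIM ioctl. A minimal userspace caller -- the mount point is
 * invented for the example:
 */
#include <fcntl.h>
#include <limits.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int trim_demo(void)
{
	struct fstrim_range range = {
		.start = 0,
		.len = ULLONG_MAX,	/* whole filesystem */
		.minlen = 0,		/* trim free extents of any size */
	};
	int fd = open("/mnt/ext4", O_RDONLY);

	if (fd < 0)
		return -1;
	if (ioctl(fd, FITRIM, &range) < 0)
		perror("FITRIM");
	else	/* on success the kernel reports the trimmed byte count */
		printf("trimmed %llu bytes\n",
		       (unsigned long long)range.len);
	close(fd);
	return 0;
}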
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 22bd4d7f289b..20b5e7bfebd1 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -193,11 +193,6 @@ struct ext4_allocation_context {
193 __u8 ac_op; /* operation, for history only */ 193 __u8 ac_op; /* operation, for history only */
194 struct page *ac_bitmap_page; 194 struct page *ac_bitmap_page;
195 struct page *ac_buddy_page; 195 struct page *ac_buddy_page;
196 /*
197 * pointer to the held semaphore upon successful
198 * block allocation
199 */
200 struct rw_semaphore *alloc_semp;
201 struct ext4_prealloc_space *ac_pa; 196 struct ext4_prealloc_space *ac_pa;
202 struct ext4_locality_group *ac_lg; 197 struct ext4_locality_group *ac_lg;
203}; 198};
@@ -215,7 +210,6 @@ struct ext4_buddy {
215 struct super_block *bd_sb; 210 struct super_block *bd_sb;
216 __u16 bd_blkbits; 211 __u16 bd_blkbits;
217 ext4_group_t bd_group; 212 ext4_group_t bd_group;
218 struct rw_semaphore *alloc_semp;
219}; 213};
220#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 214#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
221#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 215#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 92816b4e0f16..b57b98fb44d1 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
376 * We have the extent map build with the tmp inode. 376 * We have the extent map build with the tmp inode.
377 * Now copy the i_data across 377 * Now copy the i_data across
378 */ 378 */
379 ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS); 379 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); 380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
381 381
382 /* 382 /*
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
new file mode 100644
index 000000000000..9bdef3f537c5
--- /dev/null
+++ b/fs/ext4/mmp.c
@@ -0,0 +1,351 @@
1#include <linux/fs.h>
2#include <linux/random.h>
3#include <linux/buffer_head.h>
4#include <linux/utsname.h>
5#include <linux/kthread.h>
6
7#include "ext4.h"
8
9/*
10 * Write the MMP block using WRITE_SYNC to try to get the block on-disk
11 * faster.
12 */
13static int write_mmp_block(struct buffer_head *bh)
14{
15 mark_buffer_dirty(bh);
16 lock_buffer(bh);
17 bh->b_end_io = end_buffer_write_sync;
18 get_bh(bh);
19 submit_bh(WRITE_SYNC, bh);
20 wait_on_buffer(bh);
21 if (unlikely(!buffer_uptodate(bh)))
22 return 1;
23
24 return 0;
25}
26
27/*
28 * Read the MMP block. It _must_ be read from disk and hence we clear the
29 * uptodate flag on the buffer.
30 */
31static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
32 ext4_fsblk_t mmp_block)
33{
34 struct mmp_struct *mmp;
35
36 if (*bh)
37 clear_buffer_uptodate(*bh);
38
39 /* This would be sb_bread(sb, mmp_block), except we need to be sure
40 * that the MD RAID device cache has been bypassed, and that the read
41 * is not blocked in the elevator. */
42 if (!*bh)
43 *bh = sb_getblk(sb, mmp_block);
44 if (*bh) {
45 get_bh(*bh);
46 lock_buffer(*bh);
47 (*bh)->b_end_io = end_buffer_read_sync;
48 submit_bh(READ_SYNC, *bh);
49 wait_on_buffer(*bh);
50 if (!buffer_uptodate(*bh)) {
51 brelse(*bh);
52 *bh = NULL;
53 }
54 }
55 if (!*bh) {
56 ext4_warning(sb, "Error while reading MMP block %llu",
57 mmp_block);
58 return -EIO;
59 }
60
61 mmp = (struct mmp_struct *)((*bh)->b_data);
62 if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
63 return -EINVAL;
64
65 return 0;
66}
67
68/*
69 * Dump as much information as possible to help the admin.
70 */
71void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
72 const char *function, unsigned int line, const char *msg)
73{
74 __ext4_warning(sb, function, line, msg);
75 __ext4_warning(sb, function, line,
76 "MMP failure info: last update time: %llu, last update "
77 "node: %s, last update device: %s\n",
78 (long long unsigned int) le64_to_cpu(mmp->mmp_time),
79 mmp->mmp_nodename, mmp->mmp_bdevname);
80}
81
82/*
83 * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
84 */
85static int kmmpd(void *data)
86{
87 struct super_block *sb = ((struct mmpd_data *) data)->sb;
88 struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
89 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
90 struct mmp_struct *mmp;
91 ext4_fsblk_t mmp_block;
92 u32 seq = 0;
93 unsigned long failed_writes = 0;
94 int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
95 unsigned mmp_check_interval;
96 unsigned long last_update_time;
97 unsigned long diff;
98 int retval;
99
100 mmp_block = le64_to_cpu(es->s_mmp_block);
101 mmp = (struct mmp_struct *)(bh->b_data);
102 mmp->mmp_time = cpu_to_le64(get_seconds());
103 /*
104 * Start with the higher mmp_check_interval and reduce it if
105 * the MMP block is being updated on time.
106 */
107 mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
108 EXT4_MMP_MIN_CHECK_INTERVAL);
109 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
110 bdevname(bh->b_bdev, mmp->mmp_bdevname);
111
112 memcpy(mmp->mmp_nodename, init_utsname()->sysname,
113 sizeof(mmp->mmp_nodename));
114
115 while (!kthread_should_stop()) {
116 if (++seq > EXT4_MMP_SEQ_MAX)
117 seq = 1;
118
119 mmp->mmp_seq = cpu_to_le32(seq);
120 mmp->mmp_time = cpu_to_le64(get_seconds());
121 last_update_time = jiffies;
122
123 retval = write_mmp_block(bh);
124 /*
125 * Don't spew too many error messages. Print one every
126 * (s_mmp_update_interval * 60) seconds.
127 */
128 if (retval && (failed_writes % 60) == 0) {
129 ext4_error(sb, "Error writing to MMP block");
130 failed_writes++;
131 }
132
133 if (!(le32_to_cpu(es->s_feature_incompat) &
134 EXT4_FEATURE_INCOMPAT_MMP)) {
135 ext4_warning(sb, "kmmpd being stopped since MMP feature"
136 " has been disabled.");
137 EXT4_SB(sb)->s_mmp_tsk = NULL;
138 goto failed;
139 }
140
141 if (sb->s_flags & MS_RDONLY) {
142 ext4_warning(sb, "kmmpd being stopped since filesystem "
143 "has been remounted as readonly.");
144 EXT4_SB(sb)->s_mmp_tsk = NULL;
145 goto failed;
146 }
147
148 diff = jiffies - last_update_time;
149 if (diff < mmp_update_interval * HZ)
150 schedule_timeout_interruptible(mmp_update_interval *
151 HZ - diff);
152
153 /*
154 * We need to make sure that more than mmp_check_interval
155 * seconds have not passed since writing. If that has happened
156 * we need to check if the MMP block is as we left it.
157 */
158 diff = jiffies - last_update_time;
159 if (diff > mmp_check_interval * HZ) {
160 struct buffer_head *bh_check = NULL;
161 struct mmp_struct *mmp_check;
162
163 retval = read_mmp_block(sb, &bh_check, mmp_block);
164 if (retval) {
165 ext4_error(sb, "error reading MMP data: %d",
166 retval);
167
168 EXT4_SB(sb)->s_mmp_tsk = NULL;
169 goto failed;
170 }
171
172 mmp_check = (struct mmp_struct *)(bh_check->b_data);
173 if (mmp->mmp_seq != mmp_check->mmp_seq ||
174 memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
175 sizeof(mmp->mmp_nodename))) {
176 dump_mmp_msg(sb, mmp_check,
177 "Error while updating MMP info. "
178 "The filesystem seems to have been"
179 " multiply mounted.");
180 ext4_error(sb, "abort");
181 goto failed;
182 }
183 put_bh(bh_check);
184 }
185
186 /*
187 * Adjust the mmp_check_interval depending on how much time
188 * it took for the MMP block to be written.
189 */
190 mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
191 EXT4_MMP_MAX_CHECK_INTERVAL),
192 EXT4_MMP_MIN_CHECK_INTERVAL);
193 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
194 }
195
196 /*
197 * Unmount seems to be clean.
198 */
199 mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
200 mmp->mmp_time = cpu_to_le64(get_seconds());
201
202 retval = write_mmp_block(bh);
203
204failed:
205 kfree(data);
206 brelse(bh);
207 return retval;
208}
209
210/*
211 * Get a random new sequence number but make sure it is not greater than
212 * EXT4_MMP_SEQ_MAX.
213 */
214static unsigned int mmp_new_seq(void)
215{
216 u32 new_seq;
217
218 do {
219 get_random_bytes(&new_seq, sizeof(u32));
220 } while (new_seq > EXT4_MMP_SEQ_MAX);
221
222 return new_seq;
223}
224
225/*
226 * Protect the filesystem from being mounted more than once.
227 */
228int ext4_multi_mount_protect(struct super_block *sb,
229 ext4_fsblk_t mmp_block)
230{
231 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
232 struct buffer_head *bh = NULL;
233 struct mmp_struct *mmp = NULL;
234 struct mmpd_data *mmpd_data;
235 u32 seq;
236 unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
237 unsigned int wait_time = 0;
238 int retval;
239
240 if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
241 mmp_block >= ext4_blocks_count(es)) {
242 ext4_warning(sb, "Invalid MMP block in superblock");
243 goto failed;
244 }
245
246 retval = read_mmp_block(sb, &bh, mmp_block);
247 if (retval)
248 goto failed;
249
250 mmp = (struct mmp_struct *)(bh->b_data);
251
252 if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
253 mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
254
255 /*
256 * If check_interval in MMP block is larger, use that instead of
257 * update_interval from the superblock.
258 */
259 if (mmp->mmp_check_interval > mmp_check_interval)
260 mmp_check_interval = mmp->mmp_check_interval;
261
262 seq = le32_to_cpu(mmp->mmp_seq);
263 if (seq == EXT4_MMP_SEQ_CLEAN)
264 goto skip;
265
266 if (seq == EXT4_MMP_SEQ_FSCK) {
267 dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
268 goto failed;
269 }
270
271 wait_time = min(mmp_check_interval * 2 + 1,
272 mmp_check_interval + 60);
273
274 /* Print MMP interval if more than 20 secs. */
275 if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
276 ext4_warning(sb, "MMP interval %u higher than expected, please"
277 " wait.\n", wait_time * 2);
278
279 if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
280 ext4_warning(sb, "MMP startup interrupted, failing mount\n");
281 goto failed;
282 }
283
284 retval = read_mmp_block(sb, &bh, mmp_block);
285 if (retval)
286 goto failed;
287 mmp = (struct mmp_struct *)(bh->b_data);
288 if (seq != le32_to_cpu(mmp->mmp_seq)) {
289 dump_mmp_msg(sb, mmp,
290 "Device is already active on another node.");
291 goto failed;
292 }
293
294skip:
295 /*
296 * write a new random sequence number.
297 */
298 mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
299
300 retval = write_mmp_block(bh);
301 if (retval)
302 goto failed;
303
304 /*
305 * wait for MMP interval and check mmp_seq.
306 */
307 if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
308 ext4_warning(sb, "MMP startup interrupted, failing mount\n");
309 goto failed;
310 }
311
312 retval = read_mmp_block(sb, &bh, mmp_block);
313 if (retval)
314 goto failed;
315 mmp = (struct mmp_struct *)(bh->b_data);
316 if (seq != le32_to_cpu(mmp->mmp_seq)) {
317 dump_mmp_msg(sb, mmp,
318 "Device is already active on another node.");
319 goto failed;
320 }
321
322 mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
323 if (!mmpd_data) {
324 ext4_warning(sb, "not enough memory for mmpd_data");
325 goto failed;
326 }
327 mmpd_data->sb = sb;
328 mmpd_data->bh = bh;
329
330 /*
331 * Start a kernel thread to update the MMP block periodically.
332 */
333 EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
334 bdevname(bh->b_bdev,
335 mmp->mmp_bdevname));
336 if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
337 EXT4_SB(sb)->s_mmp_tsk = NULL;
338 kfree(mmpd_data);
339 ext4_warning(sb, "Unable to create kmmpd thread for %s.",
340 sb->s_id);
341 goto failed;
342 }
343
344 return 0;
345
346failed:
347 brelse(bh);
348 return 1;
349}
350
351
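
Pulled out of the code above, the mount-time handshake in ext4_multi_mount_protect() reduces to the following condensed walkthrough (a restatement of the new file, nothing added):

	/*
	 * seq = read MMP block
	 * seq == EXT4_MMP_SEQ_CLEAN -> last unmount was clean, skip the wait
	 * seq == EXT4_MMP_SEQ_FSCK  -> fsck owns the device, fail the mount
	 * sleep ~one check interval, re-read:
	 *     seq changed           -> another node is live, fail the mount
	 * write our own random seq, sleep again, re-read:
	 *     seq changed           -> we lost the race, fail the mount
	 * start kmmpd, which bumps mmp_seq every s_mmp_update_interval
	 * seconds and aborts if another node touches the block.
	 */
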
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b9f3e7862f13..2b8304bf3c50 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -876,8 +876,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
876 * It needs to call wait_on_page_writeback() to wait for the 876 * It needs to call wait_on_page_writeback() to wait for the
877 * writeback of the page. 877 * writeback of the page.
878 */ 878 */
879 if (PageWriteback(page)) 879 wait_on_page_writeback(page);
880 wait_on_page_writeback(page);
881 880
882 /* Release old bh and drop refs */ 881 /* Release old bh and drop refs */
883 try_to_release_page(page, 0); 882 try_to_release_page(page, 0);
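
The guard removed here duplicated a test the helper already performs. In the pagemap.h of this era, wait_on_page_writeback() is essentially the following (paraphrased):

	static inline void wait_on_page_writeback(struct page *page)
	{
		if (PageWriteback(page))
			wait_on_page_bit(page, PG_writeback);
	}
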
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 67fd0b025858..b754b7721f51 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1413,10 +1413,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1413 frame->at = entries; 1413 frame->at = entries;
1414 frame->bh = bh; 1414 frame->bh = bh;
1415 bh = bh2; 1415 bh = bh2;
1416
1417 ext4_handle_dirty_metadata(handle, dir, frame->bh);
1418 ext4_handle_dirty_metadata(handle, dir, bh);
1419
1416 de = do_split(handle,dir, &bh, frame, &hinfo, &retval); 1420 de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1417 dx_release (frames); 1421 if (!de) {
1418 if (!(de)) 1422 /*
1423 * Even if the block split failed, we have to properly write
1424 * out all the changes we did so far. Otherwise we can end up
1425 * with a corrupted filesystem.
1426 */
1427 ext4_mark_inode_dirty(handle, dir);
1428 dx_release(frames);
1419 return retval; 1429 return retval;
1430 }
1431 dx_release(frames);
1420 1432
1421 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1433 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1422 brelse(bh); 1434 brelse(bh);
@@ -2240,6 +2252,7 @@ static int ext4_symlink(struct inode *dir,
2240 handle_t *handle; 2252 handle_t *handle;
2241 struct inode *inode; 2253 struct inode *inode;
2242 int l, err, retries = 0; 2254 int l, err, retries = 0;
2255 int credits;
2243 2256
2244 l = strlen(symname)+1; 2257 l = strlen(symname)+1;
2245 if (l > dir->i_sb->s_blocksize) 2258 if (l > dir->i_sb->s_blocksize)
@@ -2247,10 +2260,26 @@ static int ext4_symlink(struct inode *dir,
2247 2260
2248 dquot_initialize(dir); 2261 dquot_initialize(dir);
2249 2262
2263 if (l > EXT4_N_BLOCKS * 4) {
2264 /*
2265 * For non-fast symlinks, we just allocate inode and put it on
2266 * orphan list in the first transaction => we need bitmap,
2267 * group descriptor, sb, inode block, quota blocks.
2268 */
2269 credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2270 } else {
2271 /*
2272 * Fast symlink. We have to add entry to directory
2273 * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
2274 * allocate new inode (bitmap, group descriptor, inode block,
2275 * quota blocks, sb is already counted in previous macros).
2276 */
2277 credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2278 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
2279 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2280 }
2250retry: 2281retry:
2251 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2282 handle = ext4_journal_start(dir, credits);
2252 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2253 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2254 if (IS_ERR(handle)) 2283 if (IS_ERR(handle))
2255 return PTR_ERR(handle); 2284 return PTR_ERR(handle);
2256 2285
@@ -2263,21 +2292,44 @@ retry:
2263 if (IS_ERR(inode)) 2292 if (IS_ERR(inode))
2264 goto out_stop; 2293 goto out_stop;
2265 2294
2266 if (l > sizeof(EXT4_I(inode)->i_data)) { 2295 if (l > EXT4_N_BLOCKS * 4) {
2267 inode->i_op = &ext4_symlink_inode_operations; 2296 inode->i_op = &ext4_symlink_inode_operations;
2268 ext4_set_aops(inode); 2297 ext4_set_aops(inode);
2269 /* 2298 /*
2270 * page_symlink() calls into ext4_prepare/commit_write. 2299 * We cannot call page_symlink() with transaction started
2271 * We have a transaction open. All is sweetness. It also sets 2300 * because it calls into ext4_write_begin() which can wait
2272 * i_size in generic_commit_write(). 2301 * for transaction commit if we are running out of space
2302 * and thus we deadlock. So we have to stop the transaction now
2303 * and restart it when the symlink contents are written.
2304 *
2305 * To keep the fs consistent in case of a crash, we have to put
2306 * the inode on the orphan list in the meantime.
2273 */ 2307 */
2308 drop_nlink(inode);
2309 err = ext4_orphan_add(handle, inode);
2310 ext4_journal_stop(handle);
2311 if (err)
2312 goto err_drop_inode;
2274 err = __page_symlink(inode, symname, l, 1); 2313 err = __page_symlink(inode, symname, l, 1);
2314 if (err)
2315 goto err_drop_inode;
2316 /*
2317 * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
2318 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
2319 */
2320 handle = ext4_journal_start(dir,
2321 EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2322 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
2323 if (IS_ERR(handle)) {
2324 err = PTR_ERR(handle);
2325 goto err_drop_inode;
2326 }
2327 inc_nlink(inode);
2328 err = ext4_orphan_del(handle, inode);
2275 if (err) { 2329 if (err) {
2330 ext4_journal_stop(handle);
2276 clear_nlink(inode); 2331 clear_nlink(inode);
2277 unlock_new_inode(inode); 2332 goto err_drop_inode;
2278 ext4_mark_inode_dirty(handle, inode);
2279 iput(inode);
2280 goto out_stop;
2281 } 2333 }
2282 } else { 2334 } else {
2283 /* clear the extent format for fast symlink */ 2335 /* clear the extent format for fast symlink */
@@ -2293,6 +2345,10 @@ out_stop:
2293 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 2345 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2294 goto retry; 2346 goto retry;
2295 return err; 2347 return err;
2348err_drop_inode:
2349 unlock_new_inode(inode);
2350 iput(inode);
2351 return err;
2296} 2352}
2297 2353
2298static int ext4_link(struct dentry *old_dentry, 2354static int ext4_link(struct dentry *old_dentry,
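
Condensed, the long-symlink path above is a stop/restart pattern around the page write; a sketch of the sequence from the hunk, with error paths elided:

	drop_nlink(inode);                    /* hide the half-built inode */
	err = ext4_orphan_add(handle, inode); /* crash -> orphan cleanup reaps it */
	ext4_journal_stop(handle);            /* no handle held across writeback */

	err = __page_symlink(inode, symname, l, 1); /* may wait on a commit */

	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
				    EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
	inc_nlink(inode);
	err = ext4_orphan_del(handle, inode); /* fully written, un-orphan it */

Starting the first transaction with smaller, path-specific credits (4 plus quota blocks on the slow path) instead of the old one-size worst case is what makes the split affordable.
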
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b6dbd056fcb1..7bb8f76d470a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -203,46 +203,29 @@ static void ext4_end_bio(struct bio *bio, int error)
203 for (i = 0; i < io_end->num_io_pages; i++) { 203 for (i = 0; i < io_end->num_io_pages; i++) {
204 struct page *page = io_end->pages[i]->p_page; 204 struct page *page = io_end->pages[i]->p_page;
205 struct buffer_head *bh, *head; 205 struct buffer_head *bh, *head;
206 int partial_write = 0; 206 loff_t offset;
207 loff_t io_end_offset;
207 208
208 head = page_buffers(page); 209 if (error) {
209 if (error)
210 SetPageError(page); 210 SetPageError(page);
211 BUG_ON(!head); 211 set_bit(AS_EIO, &page->mapping->flags);
212 if (head->b_size != PAGE_CACHE_SIZE) { 212 head = page_buffers(page);
213 loff_t offset; 213 BUG_ON(!head);
214 loff_t io_end_offset = io_end->offset + io_end->size; 214
215 io_end_offset = io_end->offset + io_end->size;
215 216
216 offset = (sector_t) page->index << PAGE_CACHE_SHIFT; 217 offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
217 bh = head; 218 bh = head;
218 do { 219 do {
219 if ((offset >= io_end->offset) && 220 if ((offset >= io_end->offset) &&
220 (offset+bh->b_size <= io_end_offset)) { 221 (offset+bh->b_size <= io_end_offset))
221 if (error) 222 buffer_io_error(bh);
222 buffer_io_error(bh); 223
223
224 }
225 if (buffer_delay(bh))
226 partial_write = 1;
227 else if (!buffer_mapped(bh))
228 clear_buffer_dirty(bh);
229 else if (buffer_dirty(bh))
230 partial_write = 1;
231 offset += bh->b_size; 224 offset += bh->b_size;
232 bh = bh->b_this_page; 225 bh = bh->b_this_page;
233 } while (bh != head); 226 } while (bh != head);
234 } 227 }
235 228
236 /*
237 * If this is a partial write which happened to make
238 * all buffers uptodate then we can optimize away a
239 * bogus readpage() for the next read(). Here we
240 * 'discover' whether the page went uptodate as a
241 * result of this (potentially partial) write.
242 */
243 if (!partial_write)
244 SetPageUptodate(page);
245
246 put_io_page(io_end->pages[i]); 229 put_io_page(io_end->pages[i]);
247 } 230 }
248 io_end->num_io_pages = 0; 231 io_end->num_io_pages = 0;
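
The new set_bit(AS_EIO, &page->mapping->flags) is what makes a failed writeback visible to userspace: the generic fsync()/fdatawait path tests and clears that mapping flag and converts it into -EIO. Roughly, on the wait side (a sketch of the mm behaviour of this era, not ext4 code):

	/* tail of filemap_fdatawait_range(), paraphrased */
	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;
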
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8553dfb310af..cc5c157aa11d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -38,6 +38,7 @@
38#include <linux/ctype.h> 38#include <linux/ctype.h>
39#include <linux/log2.h> 39#include <linux/log2.h>
40#include <linux/crc16.h> 40#include <linux/crc16.h>
41#include <linux/cleancache.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42 43
43#include <linux/kthread.h> 44#include <linux/kthread.h>
@@ -75,11 +76,27 @@ static void ext4_write_super(struct super_block *sb);
75static int ext4_freeze(struct super_block *sb); 76static int ext4_freeze(struct super_block *sb);
76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, 77static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
77 const char *dev_name, void *data); 78 const char *dev_name, void *data);
79static inline int ext2_feature_set_ok(struct super_block *sb);
80static inline int ext3_feature_set_ok(struct super_block *sb);
78static int ext4_feature_set_ok(struct super_block *sb, int readonly); 81static int ext4_feature_set_ok(struct super_block *sb, int readonly);
79static void ext4_destroy_lazyinit_thread(void); 82static void ext4_destroy_lazyinit_thread(void);
80static void ext4_unregister_li_request(struct super_block *sb); 83static void ext4_unregister_li_request(struct super_block *sb);
81static void ext4_clear_request_list(void); 84static void ext4_clear_request_list(void);
82 85
86#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
87static struct file_system_type ext2_fs_type = {
88 .owner = THIS_MODULE,
89 .name = "ext2",
90 .mount = ext4_mount,
91 .kill_sb = kill_block_super,
92 .fs_flags = FS_REQUIRES_DEV,
93};
94#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
95#else
96#define IS_EXT2_SB(sb) (0)
97#endif
98
99
83#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 100#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
84static struct file_system_type ext3_fs_type = { 101static struct file_system_type ext3_fs_type = {
85 .owner = THIS_MODULE, 102 .owner = THIS_MODULE,
@@ -806,6 +823,8 @@ static void ext4_put_super(struct super_block *sb)
806 invalidate_bdev(sbi->journal_bdev); 823 invalidate_bdev(sbi->journal_bdev);
807 ext4_blkdev_remove(sbi); 824 ext4_blkdev_remove(sbi);
808 } 825 }
826 if (sbi->s_mmp_tsk)
827 kthread_stop(sbi->s_mmp_tsk);
809 sb->s_fs_info = NULL; 828 sb->s_fs_info = NULL;
810 /* 829 /*
811 * Now that we are completely done shutting down the 830 * Now that we are completely done shutting down the
@@ -1096,7 +1115,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1096 1115
1097 if (!test_opt(sb, INIT_INODE_TABLE)) 1116 if (!test_opt(sb, INIT_INODE_TABLE))
1098 seq_puts(seq, ",noinit_inode_table"); 1117 seq_puts(seq, ",noinit_inode_table");
1099 else if (sbi->s_li_wait_mult) 1118 else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
1100 seq_printf(seq, ",init_inode_table=%u", 1119 seq_printf(seq, ",init_inode_table=%u",
1101 (unsigned) sbi->s_li_wait_mult); 1120 (unsigned) sbi->s_li_wait_mult);
1102 1121
@@ -1187,9 +1206,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
1187 const char *data, size_t len, loff_t off); 1206 const char *data, size_t len, loff_t off);
1188 1207
1189static const struct dquot_operations ext4_quota_operations = { 1208static const struct dquot_operations ext4_quota_operations = {
1190#ifdef CONFIG_QUOTA
1191 .get_reserved_space = ext4_get_reserved_space, 1209 .get_reserved_space = ext4_get_reserved_space,
1192#endif
1193 .write_dquot = ext4_write_dquot, 1210 .write_dquot = ext4_write_dquot,
1194 .acquire_dquot = ext4_acquire_dquot, 1211 .acquire_dquot = ext4_acquire_dquot,
1195 .release_dquot = ext4_release_dquot, 1212 .release_dquot = ext4_release_dquot,
@@ -1900,7 +1917,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1900 ext4_msg(sb, KERN_WARNING, 1917 ext4_msg(sb, KERN_WARNING,
1901 "warning: mounting fs with errors, " 1918 "warning: mounting fs with errors, "
1902 "running e2fsck is recommended"); 1919 "running e2fsck is recommended");
1903 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && 1920 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
1904 le16_to_cpu(es->s_mnt_count) >= 1921 le16_to_cpu(es->s_mnt_count) >=
1905 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 1922 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1906 ext4_msg(sb, KERN_WARNING, 1923 ext4_msg(sb, KERN_WARNING,
@@ -1932,6 +1949,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1932 EXT4_INODES_PER_GROUP(sb), 1949 EXT4_INODES_PER_GROUP(sb),
1933 sbi->s_mount_opt, sbi->s_mount_opt2); 1950 sbi->s_mount_opt, sbi->s_mount_opt2);
1934 1951
1952 cleancache_init_fs(sb);
1935 return res; 1953 return res;
1936} 1954}
1937 1955
@@ -2425,6 +2443,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2425 EXT4_SB(sb)->s_sectors_written_start) >> 1))); 2443 EXT4_SB(sb)->s_sectors_written_start) >> 1)));
2426} 2444}
2427 2445
2446static ssize_t extent_cache_hits_show(struct ext4_attr *a,
2447 struct ext4_sb_info *sbi, char *buf)
2448{
2449 return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits);
2450}
2451
2452static ssize_t extent_cache_misses_show(struct ext4_attr *a,
2453 struct ext4_sb_info *sbi, char *buf)
2454{
2455 return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses);
2456}
2457
2428static ssize_t inode_readahead_blks_store(struct ext4_attr *a, 2458static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2429 struct ext4_sb_info *sbi, 2459 struct ext4_sb_info *sbi,
2430 const char *buf, size_t count) 2460 const char *buf, size_t count)
@@ -2482,6 +2512,8 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2482EXT4_RO_ATTR(delayed_allocation_blocks); 2512EXT4_RO_ATTR(delayed_allocation_blocks);
2483EXT4_RO_ATTR(session_write_kbytes); 2513EXT4_RO_ATTR(session_write_kbytes);
2484EXT4_RO_ATTR(lifetime_write_kbytes); 2514EXT4_RO_ATTR(lifetime_write_kbytes);
2515EXT4_RO_ATTR(extent_cache_hits);
2516EXT4_RO_ATTR(extent_cache_misses);
2485EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, 2517EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
2486 inode_readahead_blks_store, s_inode_readahead_blks); 2518 inode_readahead_blks_store, s_inode_readahead_blks);
2487EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); 2519EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2497,6 +2529,8 @@ static struct attribute *ext4_attrs[] = {
2497 ATTR_LIST(delayed_allocation_blocks), 2529 ATTR_LIST(delayed_allocation_blocks),
2498 ATTR_LIST(session_write_kbytes), 2530 ATTR_LIST(session_write_kbytes),
2499 ATTR_LIST(lifetime_write_kbytes), 2531 ATTR_LIST(lifetime_write_kbytes),
2532 ATTR_LIST(extent_cache_hits),
2533 ATTR_LIST(extent_cache_misses),
2500 ATTR_LIST(inode_readahead_blks), 2534 ATTR_LIST(inode_readahead_blks),
2501 ATTR_LIST(inode_goal), 2535 ATTR_LIST(inode_goal),
2502 ATTR_LIST(mb_stats), 2536 ATTR_LIST(mb_stats),
@@ -2659,12 +2693,6 @@ static void print_daily_error_info(unsigned long arg)
2659 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ 2693 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
2660} 2694}
2661 2695
2662static void ext4_lazyinode_timeout(unsigned long data)
2663{
2664 struct task_struct *p = (struct task_struct *)data;
2665 wake_up_process(p);
2666}
2667
2668/* Find next suitable group and run ext4_init_inode_table */ 2696/* Find next suitable group and run ext4_init_inode_table */
2669static int ext4_run_li_request(struct ext4_li_request *elr) 2697static int ext4_run_li_request(struct ext4_li_request *elr)
2670{ 2698{
@@ -2696,11 +2724,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
2696 ret = ext4_init_inode_table(sb, group, 2724 ret = ext4_init_inode_table(sb, group,
2697 elr->lr_timeout ? 0 : 1); 2725 elr->lr_timeout ? 0 : 1);
2698 if (elr->lr_timeout == 0) { 2726 if (elr->lr_timeout == 0) {
2699 timeout = jiffies - timeout; 2727 timeout = (jiffies - timeout) *
2700 if (elr->lr_sbi->s_li_wait_mult) 2728 elr->lr_sbi->s_li_wait_mult;
2701 timeout *= elr->lr_sbi->s_li_wait_mult;
2702 else
2703 timeout *= 20;
2704 elr->lr_timeout = timeout; 2729 elr->lr_timeout = timeout;
2705 } 2730 }
2706 elr->lr_next_sched = jiffies + elr->lr_timeout; 2731 elr->lr_next_sched = jiffies + elr->lr_timeout;
@@ -2712,7 +2737,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
2712 2737
2713/* 2738/*
2714 * Remove lr_request from the list_request and free the 2739 * Remove lr_request from the list_request and free the
2715 * request tructure. Should be called with li_list_mtx held 2740 * request structure. Should be called with li_list_mtx held
2716 */ 2741 */
2717static void ext4_remove_li_request(struct ext4_li_request *elr) 2742static void ext4_remove_li_request(struct ext4_li_request *elr)
2718{ 2743{
@@ -2730,14 +2755,16 @@ static void ext4_remove_li_request(struct ext4_li_request *elr)
2730 2755
2731static void ext4_unregister_li_request(struct super_block *sb) 2756static void ext4_unregister_li_request(struct super_block *sb)
2732{ 2757{
2733 struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request; 2758 mutex_lock(&ext4_li_mtx);
2734 2759 if (!ext4_li_info) {
2735 if (!ext4_li_info) 2760 mutex_unlock(&ext4_li_mtx);
2736 return; 2761 return;
2762 }
2737 2763
2738 mutex_lock(&ext4_li_info->li_list_mtx); 2764 mutex_lock(&ext4_li_info->li_list_mtx);
2739 ext4_remove_li_request(elr); 2765 ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
2740 mutex_unlock(&ext4_li_info->li_list_mtx); 2766 mutex_unlock(&ext4_li_info->li_list_mtx);
2767 mutex_unlock(&ext4_li_mtx);
2741} 2768}
2742 2769
2743static struct task_struct *ext4_lazyinit_task; 2770static struct task_struct *ext4_lazyinit_task;
@@ -2756,17 +2783,10 @@ static int ext4_lazyinit_thread(void *arg)
2756 struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; 2783 struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
2757 struct list_head *pos, *n; 2784 struct list_head *pos, *n;
2758 struct ext4_li_request *elr; 2785 struct ext4_li_request *elr;
2759 unsigned long next_wakeup; 2786 unsigned long next_wakeup, cur;
2760 DEFINE_WAIT(wait);
2761 2787
2762 BUG_ON(NULL == eli); 2788 BUG_ON(NULL == eli);
2763 2789
2764 eli->li_timer.data = (unsigned long)current;
2765 eli->li_timer.function = ext4_lazyinode_timeout;
2766
2767 eli->li_task = current;
2768 wake_up(&eli->li_wait_task);
2769
2770cont_thread: 2790cont_thread:
2771 while (true) { 2791 while (true) {
2772 next_wakeup = MAX_JIFFY_OFFSET; 2792 next_wakeup = MAX_JIFFY_OFFSET;
@@ -2797,19 +2817,15 @@ cont_thread:
2797 if (freezing(current)) 2817 if (freezing(current))
2798 refrigerator(); 2818 refrigerator();
2799 2819
2800 if ((time_after_eq(jiffies, next_wakeup)) || 2820 cur = jiffies;
2821 if ((time_after_eq(cur, next_wakeup)) ||
2801 (MAX_JIFFY_OFFSET == next_wakeup)) { 2822 (MAX_JIFFY_OFFSET == next_wakeup)) {
2802 cond_resched(); 2823 cond_resched();
2803 continue; 2824 continue;
2804 } 2825 }
2805 2826
2806 eli->li_timer.expires = next_wakeup; 2827 schedule_timeout_interruptible(next_wakeup - cur);
2807 add_timer(&eli->li_timer); 2828
2808 prepare_to_wait(&eli->li_wait_daemon, &wait,
2809 TASK_INTERRUPTIBLE);
2810 if (time_before(jiffies, next_wakeup))
2811 schedule();
2812 finish_wait(&eli->li_wait_daemon, &wait);
2813 if (kthread_should_stop()) { 2829 if (kthread_should_stop()) {
2814 ext4_clear_request_list(); 2830 ext4_clear_request_list();
2815 goto exit_thread; 2831 goto exit_thread;
@@ -2833,12 +2849,7 @@ exit_thread:
2833 goto cont_thread; 2849 goto cont_thread;
2834 } 2850 }
2835 mutex_unlock(&eli->li_list_mtx); 2851 mutex_unlock(&eli->li_list_mtx);
2836 del_timer_sync(&ext4_li_info->li_timer);
2837 eli->li_task = NULL;
2838 wake_up(&eli->li_wait_task);
2839
2840 kfree(ext4_li_info); 2852 kfree(ext4_li_info);
2841 ext4_lazyinit_task = NULL;
2842 ext4_li_info = NULL; 2853 ext4_li_info = NULL;
2843 mutex_unlock(&ext4_li_mtx); 2854 mutex_unlock(&ext4_li_mtx);
2844 2855
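
The private timer plus two waitqueues that the lazyinit thread used to juggle collapse into the stock kthread sleep idiom; the shape of the new loop, isolated from the hunk above:

	while (!kthread_should_stop()) {
		unsigned long cur = jiffies;

		if (time_after_eq(cur, next_wakeup)) {
			cond_resched();		/* work is due, loop around */
			continue;
		}
		/* sleeps, but wakes early on a signal or kthread_stop() */
		schedule_timeout_interruptible(next_wakeup - cur);
	}

Sampling jiffies once into cur also guarantees that next_wakeup - cur is still positive at the point of the call.
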
@@ -2866,7 +2877,6 @@ static int ext4_run_lazyinit_thread(void)
2866 if (IS_ERR(ext4_lazyinit_task)) { 2877 if (IS_ERR(ext4_lazyinit_task)) {
2867 int err = PTR_ERR(ext4_lazyinit_task); 2878 int err = PTR_ERR(ext4_lazyinit_task);
2868 ext4_clear_request_list(); 2879 ext4_clear_request_list();
2869 del_timer_sync(&ext4_li_info->li_timer);
2870 kfree(ext4_li_info); 2880 kfree(ext4_li_info);
2871 ext4_li_info = NULL; 2881 ext4_li_info = NULL;
2872 printk(KERN_CRIT "EXT4: error %d creating inode table " 2882 printk(KERN_CRIT "EXT4: error %d creating inode table "
@@ -2875,8 +2885,6 @@ static int ext4_run_lazyinit_thread(void)
2875 return err; 2885 return err;
2876 } 2886 }
2877 ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; 2887 ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
2878
2879 wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
2880 return 0; 2888 return 0;
2881} 2889}
2882 2890
@@ -2911,13 +2919,9 @@ static int ext4_li_info_new(void)
2911 if (!eli) 2919 if (!eli)
2912 return -ENOMEM; 2920 return -ENOMEM;
2913 2921
2914 eli->li_task = NULL;
2915 INIT_LIST_HEAD(&eli->li_request_list); 2922 INIT_LIST_HEAD(&eli->li_request_list);
2916 mutex_init(&eli->li_list_mtx); 2923 mutex_init(&eli->li_list_mtx);
2917 2924
2918 init_waitqueue_head(&eli->li_wait_daemon);
2919 init_waitqueue_head(&eli->li_wait_task);
2920 init_timer(&eli->li_timer);
2921 eli->li_state |= EXT4_LAZYINIT_QUIT; 2925 eli->li_state |= EXT4_LAZYINIT_QUIT;
2922 2926
2923 ext4_li_info = eli; 2927 ext4_li_info = eli;
@@ -2960,20 +2964,19 @@ static int ext4_register_li_request(struct super_block *sb,
2960 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 2964 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
2961 int ret = 0; 2965 int ret = 0;
2962 2966
2963 if (sbi->s_li_request != NULL) 2967 if (sbi->s_li_request != NULL) {
2968 /*
2969 * Reset timeout so it can be computed again, because
2970 * s_li_wait_mult might have changed.
2971 */
2972 sbi->s_li_request->lr_timeout = 0;
2964 return 0; 2973 return 0;
2974 }
2965 2975
2966 if (first_not_zeroed == ngroups || 2976 if (first_not_zeroed == ngroups ||
2967 (sb->s_flags & MS_RDONLY) || 2977 (sb->s_flags & MS_RDONLY) ||
2968 !test_opt(sb, INIT_INODE_TABLE)) { 2978 !test_opt(sb, INIT_INODE_TABLE))
2969 sbi->s_li_request = NULL;
2970 return 0; 2979 return 0;
2971 }
2972
2973 if (first_not_zeroed == ngroups) {
2974 sbi->s_li_request = NULL;
2975 return 0;
2976 }
2977 2980
2978 elr = ext4_li_request_new(sb, first_not_zeroed); 2981 elr = ext4_li_request_new(sb, first_not_zeroed);
2979 if (!elr) 2982 if (!elr)
@@ -3166,6 +3169,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3166 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 3169 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
3167 set_opt(sb, DELALLOC); 3170 set_opt(sb, DELALLOC);
3168 3171
3172 /*
3173 * set default s_li_wait_mult for lazyinit, in case no mount
3174 * option is specified.
3175 */
3176 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
3177
3169 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, 3178 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
3170 &journal_devnum, &journal_ioprio, NULL, 0)) { 3179 &journal_devnum, &journal_ioprio, NULL, 0)) {
3171 ext4_msg(sb, KERN_WARNING, 3180 ext4_msg(sb, KERN_WARNING,
@@ -3187,6 +3196,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3187 "feature flags set on rev 0 fs, " 3196 "feature flags set on rev 0 fs, "
3188 "running e2fsck is recommended"); 3197 "running e2fsck is recommended");
3189 3198
3199 if (IS_EXT2_SB(sb)) {
3200 if (ext2_feature_set_ok(sb))
3201 ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
3202 "using the ext4 subsystem");
3203 else {
3204 ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
3205 "to feature incompatibilities");
3206 goto failed_mount;
3207 }
3208 }
3209
3210 if (IS_EXT3_SB(sb)) {
3211 if (ext3_feature_set_ok(sb))
3212 ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
3213 "using the ext4 subsystem");
3214 else {
3215 ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
3216 "to feature incompatibilities");
3217 goto failed_mount;
3218 }
3219 }
3220
3190 /* 3221 /*
3191 * Check feature flags regardless of the revision level, since we 3222 * Check feature flags regardless of the revision level, since we
3192 * previously didn't change the revision level when setting the flags, 3223 * previously didn't change the revision level when setting the flags,
@@ -3459,6 +3490,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3459 EXT4_HAS_INCOMPAT_FEATURE(sb, 3490 EXT4_HAS_INCOMPAT_FEATURE(sb,
3460 EXT4_FEATURE_INCOMPAT_RECOVER)); 3491 EXT4_FEATURE_INCOMPAT_RECOVER));
3461 3492
3493 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
3494 !(sb->s_flags & MS_RDONLY))
3495 if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
3496 goto failed_mount3;
3497
3462 /* 3498 /*
3463 * The first inode we look at is the journal inode. Don't try 3499 * The first inode we look at is the journal inode. Don't try
3464 * root first: it may be modified in the journal! 3500 * root first: it may be modified in the journal!
@@ -3474,7 +3510,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3474 goto failed_mount_wq; 3510 goto failed_mount_wq;
3475 } else { 3511 } else {
3476 clear_opt(sb, DATA_FLAGS); 3512 clear_opt(sb, DATA_FLAGS);
3477 set_opt(sb, WRITEBACK_DATA);
3478 sbi->s_journal = NULL; 3513 sbi->s_journal = NULL;
3479 needs_recovery = 0; 3514 needs_recovery = 0;
3480 goto no_journal; 3515 goto no_journal;
@@ -3707,6 +3742,8 @@ failed_mount3:
3707 percpu_counter_destroy(&sbi->s_freeinodes_counter); 3742 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3708 percpu_counter_destroy(&sbi->s_dirs_counter); 3743 percpu_counter_destroy(&sbi->s_dirs_counter);
3709 percpu_counter_destroy(&sbi->s_dirtyblocks_counter); 3744 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3745 if (sbi->s_mmp_tsk)
3746 kthread_stop(sbi->s_mmp_tsk);
3710failed_mount2: 3747failed_mount2:
3711 for (i = 0; i < db_count; i++) 3748 for (i = 0; i < db_count; i++)
3712 brelse(sbi->s_group_desc[i]); 3749 brelse(sbi->s_group_desc[i]);
@@ -4242,7 +4279,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4242 int enable_quota = 0; 4279 int enable_quota = 0;
4243 ext4_group_t g; 4280 ext4_group_t g;
4244 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 4281 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
4245 int err; 4282 int err = 0;
4246#ifdef CONFIG_QUOTA 4283#ifdef CONFIG_QUOTA
4247 int i; 4284 int i;
4248#endif 4285#endif
@@ -4368,6 +4405,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4368 goto restore_opts; 4405 goto restore_opts;
4369 if (!ext4_setup_super(sb, es, 0)) 4406 if (!ext4_setup_super(sb, es, 0))
4370 sb->s_flags &= ~MS_RDONLY; 4407 sb->s_flags &= ~MS_RDONLY;
4408 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
4409 EXT4_FEATURE_INCOMPAT_MMP))
4410 if (ext4_multi_mount_protect(sb,
4411 le64_to_cpu(es->s_mmp_block))) {
4412 err = -EROFS;
4413 goto restore_opts;
4414 }
4371 enable_quota = 1; 4415 enable_quota = 1;
4372 } 4416 }
4373 } 4417 }
@@ -4432,6 +4476,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4432 struct ext4_sb_info *sbi = EXT4_SB(sb); 4476 struct ext4_sb_info *sbi = EXT4_SB(sb);
4433 struct ext4_super_block *es = sbi->s_es; 4477 struct ext4_super_block *es = sbi->s_es;
4434 u64 fsid; 4478 u64 fsid;
4479 s64 bfree;
4435 4480
4436 if (test_opt(sb, MINIX_DF)) { 4481 if (test_opt(sb, MINIX_DF)) {
4437 sbi->s_overhead_last = 0; 4482 sbi->s_overhead_last = 0;
@@ -4475,8 +4520,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4475 buf->f_type = EXT4_SUPER_MAGIC; 4520 buf->f_type = EXT4_SUPER_MAGIC;
4476 buf->f_bsize = sb->s_blocksize; 4521 buf->f_bsize = sb->s_blocksize;
4477 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 4522 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
4478 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - 4523 bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
4479 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); 4524 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
4525 /* prevent underflow when only a little free space is available */
4526 buf->f_bfree = max_t(s64, bfree, 0);
4480 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 4527 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
4481 if (buf->f_bfree < ext4_r_blocks_count(es)) 4528 if (buf->f_bfree < ext4_r_blocks_count(es))
4482 buf->f_bavail = 0; 4529 buf->f_bavail = 0;
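
Why the clamp: the two percpu counter sums are taken at slightly different times and are only approximately consistent, so free minus dirty can transiently go negative under load; assigning that to the unsigned f_bfree would report an enormous bogus free count. The whole fix is two lines:

	bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
		percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
	buf->f_bfree = max_t(s64, bfree, 0);	/* never let it wrap */
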
@@ -4652,6 +4699,9 @@ static int ext4_quota_off(struct super_block *sb, int type)
4652 if (test_opt(sb, DELALLOC)) 4699 if (test_opt(sb, DELALLOC))
4653 sync_filesystem(sb); 4700 sync_filesystem(sb);
4654 4701
4702 if (!inode)
4703 goto out;
4704
4655 /* Update modification times of quota files when userspace can 4705 /* Update modification times of quota files when userspace can
4656 * start looking at them */ 4706 * start looking at them */
4657 handle = ext4_journal_start(inode, 1); 4707 handle = ext4_journal_start(inode, 1);
@@ -4772,14 +4822,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
4772} 4822}
4773 4823
4774#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4824#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4775static struct file_system_type ext2_fs_type = {
4776 .owner = THIS_MODULE,
4777 .name = "ext2",
4778 .mount = ext4_mount,
4779 .kill_sb = kill_block_super,
4780 .fs_flags = FS_REQUIRES_DEV,
4781};
4782
4783static inline void register_as_ext2(void) 4825static inline void register_as_ext2(void)
4784{ 4826{
4785 int err = register_filesystem(&ext2_fs_type); 4827 int err = register_filesystem(&ext2_fs_type);
@@ -4792,10 +4834,22 @@ static inline void unregister_as_ext2(void)
4792{ 4834{
4793 unregister_filesystem(&ext2_fs_type); 4835 unregister_filesystem(&ext2_fs_type);
4794} 4836}
4837
4838static inline int ext2_feature_set_ok(struct super_block *sb)
4839{
4840 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
4841 return 0;
4842 if (sb->s_flags & MS_RDONLY)
4843 return 1;
4844 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
4845 return 0;
4846 return 1;
4847}
4795MODULE_ALIAS("ext2"); 4848MODULE_ALIAS("ext2");
4796#else 4849#else
4797static inline void register_as_ext2(void) { } 4850static inline void register_as_ext2(void) { }
4798static inline void unregister_as_ext2(void) { } 4851static inline void unregister_as_ext2(void) { }
4852static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
4799#endif 4853#endif
4800 4854
4801#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4855#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
@@ -4811,10 +4865,24 @@ static inline void unregister_as_ext3(void)
4811{ 4865{
4812 unregister_filesystem(&ext3_fs_type); 4866 unregister_filesystem(&ext3_fs_type);
4813} 4867}
4868
4869static inline int ext3_feature_set_ok(struct super_block *sb)
4870{
4871 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
4872 return 0;
4873 if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
4874 return 0;
4875 if (sb->s_flags & MS_RDONLY)
4876 return 1;
4877 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
4878 return 0;
4879 return 1;
4880}
4814MODULE_ALIAS("ext3"); 4881MODULE_ALIAS("ext3");
4815#else 4882#else
4816static inline void register_as_ext3(void) { } 4883static inline void register_as_ext3(void) { }
4817static inline void unregister_as_ext3(void) { } 4884static inline void unregister_as_ext3(void) { }
4885static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
4818#endif 4886#endif
4819 4887
4820static struct file_system_type ext4_fs_type = { 4888static struct file_system_type ext4_fs_type = {
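
Reading the masks in ext2_feature_set_ok() and ext3_feature_set_ok(): each test asks whether the superblock sets any feature bit outside what the older driver understood. A commented restatement of the ext3 variant above:

	/*
	 * EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP)
	 *	any INCOMPAT bit ext3 never knew -> refuse outright
	 * !EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)
	 *	ext3 requires a journal -> no journal, no ext3
	 * sb->s_flags & MS_RDONLY
	 *	RO_COMPAT bits only matter for writing -> read-only gets a pass
	 * EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)
	 *	unknown RO_COMPAT bit on a writable mount -> refuse
	 */
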
@@ -4898,8 +4966,8 @@ static int __init ext4_init_fs(void)
4898 err = init_inodecache(); 4966 err = init_inodecache();
4899 if (err) 4967 if (err)
4900 goto out1; 4968 goto out1;
4901 register_as_ext2();
4902 register_as_ext3(); 4969 register_as_ext3();
4970 register_as_ext2();
4903 err = register_filesystem(&ext4_fs_type); 4971 err = register_filesystem(&ext4_fs_type);
4904 if (err) 4972 if (err)
4905 goto out; 4973 goto out;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b545ca1c459c..c757adc97250 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -820,8 +820,8 @@ inserted:
820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
822 822
823 block = ext4_new_meta_blocks(handle, inode, 823 block = ext4_new_meta_blocks(handle, inode, goal, 0,
824 goal, NULL, &error); 824 NULL, &error);
825 if (error) 825 if (error)
826 goto cleanup; 826 goto cleanup;
827 827
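
For context on the reshuffled arguments: ext4_new_meta_blocks() grows a flags word elsewhere in this series, so the prototype this call site is adapting to is presumably the following (an assumption reconstructed from the call, not shown in this diff; 0 means no allocation flags, NULL keeps the single-block default):

	ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
					  ext4_fsblk_t goal, unsigned int flags,
					  unsigned long *count, int *errp);
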
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index ae8200f84e39..1cc7038e273d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -151,6 +151,13 @@ static void fat_cache_add(struct inode *inode, struct fat_cache_id *new)
151 spin_unlock(&MSDOS_I(inode)->cache_lru_lock); 151 spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
152 152
153 tmp = fat_cache_alloc(inode); 153 tmp = fat_cache_alloc(inode);
154 if (!tmp) {
155 spin_lock(&MSDOS_I(inode)->cache_lru_lock);
156 MSDOS_I(inode)->nr_caches--;
157 spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
158 return;
159 }
160
154 spin_lock(&MSDOS_I(inode)->cache_lru_lock); 161 spin_lock(&MSDOS_I(inode)->cache_lru_lock);
155 cache = fat_cache_merge(inode, new); 162 cache = fat_cache_merge(inode, new);
156 if (cache != NULL) { 163 if (cache != NULL) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index ee42b9e0b16a..4ad64732cbce 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -98,7 +98,7 @@ next:
98 98
99 *bh = sb_bread(sb, phys); 99 *bh = sb_bread(sb, phys);
100 if (*bh == NULL) { 100 if (*bh == NULL) {
101 printk(KERN_ERR "FAT: Directory bread(block %llu) failed\n", 101 fat_msg(sb, KERN_ERR, "Directory bread(block %llu) failed",
102 (llu)phys); 102 (llu)phys);
103 /* skip this block */ 103 /* skip this block */
104 *pos = (iblock + 1) << sb->s_blocksize_bits; 104 *pos = (iblock + 1) << sb->s_blocksize_bits;
@@ -136,9 +136,10 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos,
136 * but ignore that right now. 136 * but ignore that right now.
137 * Ahem... Stack smashing in ring 0 isn't fun. Fixed. 137 * Ahem... Stack smashing in ring 0 isn't fun. Fixed.
138 */ 138 */
139static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len, 139static int uni16_to_x8(struct super_block *sb, unsigned char *ascii,
140 int uni_xlate, struct nls_table *nls) 140 const wchar_t *uni, int len, struct nls_table *nls)
141{ 141{
142 int uni_xlate = MSDOS_SB(sb)->options.unicode_xlate;
142 const wchar_t *ip; 143 const wchar_t *ip;
143 wchar_t ec; 144 wchar_t ec;
144 unsigned char *op; 145 unsigned char *op;
@@ -166,23 +167,23 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
166 } 167 }
167 168
168 if (unlikely(*ip)) { 169 if (unlikely(*ip)) {
169 printk(KERN_WARNING "FAT: filename was truncated while " 170 fat_msg(sb, KERN_WARNING, "filename was truncated while "
170 "converting."); 171 "converting.");
171 } 172 }
172 173
173 *op = 0; 174 *op = 0;
174 return (op - ascii); 175 return (op - ascii);
175} 176}
176 177
177static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni, 178static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni,
178 unsigned char *buf, int size) 179 unsigned char *buf, int size)
179{ 180{
181 struct msdos_sb_info *sbi = MSDOS_SB(sb);
180 if (sbi->options.utf8) 182 if (sbi->options.utf8)
181 return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS, 183 return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS,
182 UTF16_HOST_ENDIAN, buf, size); 184 UTF16_HOST_ENDIAN, buf, size);
183 else 185 else
184 return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate, 186 return uni16_to_x8(sb, buf, uni, size, sbi->nls_io);
185 sbi->nls_io);
186} 187}
187 188
188static inline int 189static inline int
@@ -419,7 +420,7 @@ parse_record:
419 420
420 /* Compare shortname */ 421 /* Compare shortname */
421 bufuname[last_u] = 0x0000; 422 bufuname[last_u] = 0x0000;
422 len = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); 423 len = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname));
423 if (fat_name_match(sbi, name, name_len, bufname, len)) 424 if (fat_name_match(sbi, name, name_len, bufname, len))
424 goto found; 425 goto found;
425 426
@@ -428,7 +429,7 @@ parse_record:
428 int size = PATH_MAX - FAT_MAX_UNI_SIZE; 429 int size = PATH_MAX - FAT_MAX_UNI_SIZE;
429 430
430 /* Compare longname */ 431 /* Compare longname */
431 len = fat_uni_to_x8(sbi, unicode, longname, size); 432 len = fat_uni_to_x8(sb, unicode, longname, size);
432 if (fat_name_match(sbi, name, name_len, longname, len)) 433 if (fat_name_match(sbi, name, name_len, longname, len))
433 goto found; 434 goto found;
434 } 435 }
@@ -545,7 +546,7 @@ parse_record:
545 if (nr_slots) { 546 if (nr_slots) {
546 void *longname = unicode + FAT_MAX_UNI_CHARS; 547 void *longname = unicode + FAT_MAX_UNI_CHARS;
547 int size = PATH_MAX - FAT_MAX_UNI_SIZE; 548 int size = PATH_MAX - FAT_MAX_UNI_SIZE;
548 int len = fat_uni_to_x8(sbi, unicode, longname, size); 549 int len = fat_uni_to_x8(sb, unicode, longname, size);
549 550
550 fill_name = longname; 551 fill_name = longname;
551 fill_len = len; 552 fill_len = len;
@@ -621,7 +622,7 @@ parse_record:
621 622
622 if (isvfat) { 623 if (isvfat) {
623 bufuname[j] = 0x0000; 624 bufuname[j] = 0x0000;
624 i = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); 625 i = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname));
625 } 626 }
626 if (nr_slots) { 627 if (nr_slots) {
627 /* hack for fat_ioctl_filldir() */ 628 /* hack for fat_ioctl_filldir() */
@@ -979,6 +980,7 @@ static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
979 980
980int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) 981int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
981{ 982{
983 struct super_block *sb = dir->i_sb;
982 struct msdos_dir_entry *de; 984 struct msdos_dir_entry *de;
983 struct buffer_head *bh; 985 struct buffer_head *bh;
984 int err = 0, nr_slots; 986 int err = 0, nr_slots;
@@ -1013,8 +1015,8 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
1013 */ 1015 */
1014 err = __fat_remove_entries(dir, sinfo->slot_off, nr_slots); 1016 err = __fat_remove_entries(dir, sinfo->slot_off, nr_slots);
1015 if (err) { 1017 if (err) {
1016 printk(KERN_WARNING 1018 fat_msg(sb, KERN_WARNING,
1017 "FAT: Couldn't remove the long name slots\n"); 1019 "Couldn't remove the long name slots");
1018 } 1020 }
1019 } 1021 }
1020 1022
@@ -1265,7 +1267,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
1265 if (sbi->fat_bits != 32) 1267 if (sbi->fat_bits != 32)
1266 goto error; 1268 goto error;
1267 } else if (MSDOS_I(dir)->i_start == 0) { 1269 } else if (MSDOS_I(dir)->i_start == 0) {
1268 printk(KERN_ERR "FAT: Corrupted directory (i_pos %lld)\n", 1270 fat_msg(sb, KERN_ERR, "Corrupted directory (i_pos %lld)",
1269 MSDOS_I(dir)->i_pos); 1271 MSDOS_I(dir)->i_pos);
1270 err = -EIO; 1272 err = -EIO;
1271 goto error; 1273 goto error;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index f50408901f7e..8276cc282dec 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -319,19 +319,20 @@ extern struct inode *fat_build_inode(struct super_block *sb,
319 struct msdos_dir_entry *de, loff_t i_pos); 319 struct msdos_dir_entry *de, loff_t i_pos);
320extern int fat_sync_inode(struct inode *inode); 320extern int fat_sync_inode(struct inode *inode);
321extern int fat_fill_super(struct super_block *sb, void *data, int silent, 321extern int fat_fill_super(struct super_block *sb, void *data, int silent,
322 const struct inode_operations *fs_dir_inode_ops, 322 int isvfat, void (*setup)(struct super_block *));
323 int isvfat, void (*setup)(struct super_block *));
324 323
325extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 324extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
326 struct inode *i2); 325 struct inode *i2);
327/* fat/misc.c */ 326/* fat/misc.c */
328extern void 327extern void
329__fat_fs_error(struct super_block *s, int report, const char *fmt, ...) 328__fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
329 __attribute__ ((format (printf, 3, 4))) __cold;
330#define fat_fs_error(sb, fmt, args...) \
331 __fat_fs_error(sb, 1, fmt , ## args)
332#define fat_fs_error_ratelimit(sb, fmt, args...) \
333 __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args)
334void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
330 __attribute__ ((format (printf, 3, 4))) __cold; 335 __attribute__ ((format (printf, 3, 4))) __cold;
331#define fat_fs_error(s, fmt, args...) \
332 __fat_fs_error(s, 1, fmt , ## args)
333#define fat_fs_error_ratelimit(s, fmt, args...) \
334 __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
335extern int fat_clusters_flush(struct super_block *sb); 336extern int fat_clusters_flush(struct super_block *sb);
336extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 337extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
337extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, 338extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
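
fat_msg() itself lives in fs/fat/misc.c and is not shown in this diff; judging by the call sites it is presumably the usual %pV wrapper that prefixes the log level and device name, along these lines (sketch):

	void fat_msg(struct super_block *sb, const char *level,
		     const char *fmt, ...)
	{
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf);
		va_end(args);
	}

This is also why the converted messages drop their trailing \n: the wrapper appends one itself.
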
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index b47d2c9f4fa1..2e81ac0df7e2 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -95,7 +95,7 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
95err_brelse: 95err_brelse:
96 brelse(bhs[0]); 96 brelse(bhs[0]);
97err: 97err:
98 printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", (llu)blocknr); 98 fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr);
99 return -EIO; 99 return -EIO;
100} 100}
101 101
@@ -108,7 +108,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
108 fatent->fat_inode = MSDOS_SB(sb)->fat_inode; 108 fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
109 fatent->bhs[0] = sb_bread(sb, blocknr); 109 fatent->bhs[0] = sb_bread(sb, blocknr);
110 if (!fatent->bhs[0]) { 110 if (!fatent->bhs[0]) {
111 printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", 111 fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
112 (llu)blocknr); 112 (llu)blocknr);
113 return -EIO; 113 return -EIO;
114 } 114 }
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 8d68690bdcf1..cb8d8391ac0b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -581,7 +581,8 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
581 buf->f_bavail = sbi->free_clusters; 581 buf->f_bavail = sbi->free_clusters;
582 buf->f_fsid.val[0] = (u32)id; 582 buf->f_fsid.val[0] = (u32)id;
583 buf->f_fsid.val[1] = (u32)(id >> 32); 583 buf->f_fsid.val[1] = (u32)(id >> 32);
584 buf->f_namelen = sbi->options.isvfat ? FAT_LFN_LEN : 12; 584 buf->f_namelen =
585 (sbi->options.isvfat ? FAT_LFN_LEN : 12) * NLS_MAX_CHARSET_SIZE;
585 586
586 return 0; 587 return 0;
587} 588}
@@ -619,8 +620,8 @@ retry:
619 620
620 bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); 621 bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
621 if (!bh) { 622 if (!bh) {
622 printk(KERN_ERR "FAT: unable to read inode block " 623 fat_msg(sb, KERN_ERR, "unable to read inode block "
623 "for updating (i_pos %lld)\n", i_pos); 624 "for updating (i_pos %lld)", i_pos);
624 return -EIO; 625 return -EIO;
625 } 626 }
626 spin_lock(&sbi->inode_hash_lock); 627 spin_lock(&sbi->inode_hash_lock);
@@ -976,8 +977,8 @@ static const match_table_t vfat_tokens = {
976 {Opt_err, NULL} 977 {Opt_err, NULL}
977}; 978};
978 979
979static int parse_options(char *options, int is_vfat, int silent, int *debug, 980static int parse_options(struct super_block *sb, char *options, int is_vfat,
980 struct fat_mount_options *opts) 981 int silent, int *debug, struct fat_mount_options *opts)
981{ 982{
982 char *p; 983 char *p;
983 substring_t args[MAX_OPT_ARGS]; 984 substring_t args[MAX_OPT_ARGS];
@@ -1168,15 +1169,15 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
1168 1169
1169 /* obsolete mount options */ 1170 /* obsolete mount options */
1170 case Opt_obsolate: 1171 case Opt_obsolate:
1171 printk(KERN_INFO "FAT: \"%s\" option is obsolete, " 1172 fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, "
1172 "not supported now\n", p); 1173 "not supported now", p);
1173 break; 1174 break;
1174 /* unknown option */ 1175 /* unknown option */
1175 default: 1176 default:
1176 if (!silent) { 1177 if (!silent) {
1177 printk(KERN_ERR 1178 fat_msg(sb, KERN_ERR,
1178 "FAT: Unrecognized mount option \"%s\" " 1179 "Unrecognized mount option \"%s\" "
1179 "or missing value\n", p); 1180 "or missing value", p);
1180 } 1181 }
1181 return -EINVAL; 1182 return -EINVAL;
1182 } 1183 }
@@ -1185,7 +1186,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
1185out: 1186out:
1186 /* UTF-8 doesn't provide FAT semantics */ 1187 /* UTF-8 doesn't provide FAT semantics */
1187 if (!strcmp(opts->iocharset, "utf8")) { 1188 if (!strcmp(opts->iocharset, "utf8")) {
1188 printk(KERN_ERR "FAT: utf8 is not a recommended IO charset" 1189 fat_msg(sb, KERN_ERR, "utf8 is not a recommended IO charset"
1189 " for FAT filesystems, filesystem will be " 1190 " for FAT filesystems, filesystem will be "
1190 "case sensitive!\n"); 1191 "case sensitive!\n");
1191 } 1192 }
@@ -1238,8 +1239,7 @@ static int fat_read_root(struct inode *inode)
1238/* 1239/*
1239 * Read the super block of an MS-DOS FS. 1240 * Read the super block of an MS-DOS FS.
1240 */ 1241 */
1241int fat_fill_super(struct super_block *sb, void *data, int silent, 1242int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1242 const struct inode_operations *fs_dir_inode_ops, int isvfat,
1243 void (*setup)(struct super_block *)) 1243 void (*setup)(struct super_block *))
1244{ 1244{
1245 struct inode *root_inode = NULL, *fat_inode = NULL; 1245 struct inode *root_inode = NULL, *fat_inode = NULL;
@@ -1268,11 +1268,10 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1268 sb->s_magic = MSDOS_SUPER_MAGIC; 1268 sb->s_magic = MSDOS_SUPER_MAGIC;
1269 sb->s_op = &fat_sops; 1269 sb->s_op = &fat_sops;
1270 sb->s_export_op = &fat_export_ops; 1270 sb->s_export_op = &fat_export_ops;
1271 sbi->dir_ops = fs_dir_inode_ops;
1272 ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, 1271 ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
1273 DEFAULT_RATELIMIT_BURST); 1272 DEFAULT_RATELIMIT_BURST);
1274 1273
1275 error = parse_options(data, isvfat, silent, &debug, &sbi->options); 1274 error = parse_options(sb, data, isvfat, silent, &debug, &sbi->options);
1276 if (error) 1275 if (error)
1277 goto out_fail; 1276 goto out_fail;
1278 1277
@@ -1282,20 +1281,20 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1282 sb_min_blocksize(sb, 512); 1281 sb_min_blocksize(sb, 512);
1283 bh = sb_bread(sb, 0); 1282 bh = sb_bread(sb, 0);
1284 if (bh == NULL) { 1283 if (bh == NULL) {
1285 printk(KERN_ERR "FAT: unable to read boot sector\n"); 1284 fat_msg(sb, KERN_ERR, "unable to read boot sector");
1286 goto out_fail; 1285 goto out_fail;
1287 } 1286 }
1288 1287
1289 b = (struct fat_boot_sector *) bh->b_data; 1288 b = (struct fat_boot_sector *) bh->b_data;
1290 if (!b->reserved) { 1289 if (!b->reserved) {
1291 if (!silent) 1290 if (!silent)
1292 printk(KERN_ERR "FAT: bogus number of reserved sectors\n"); 1291 fat_msg(sb, KERN_ERR, "bogus number of reserved sectors");
1293 brelse(bh); 1292 brelse(bh);
1294 goto out_invalid; 1293 goto out_invalid;
1295 } 1294 }
1296 if (!b->fats) { 1295 if (!b->fats) {
1297 if (!silent) 1296 if (!silent)
1298 printk(KERN_ERR "FAT: bogus number of FAT structure\n"); 1297 fat_msg(sb, KERN_ERR, "bogus number of FAT structure");
1299 brelse(bh); 1298 brelse(bh);
1300 goto out_invalid; 1299 goto out_invalid;
1301 } 1300 }
@@ -1308,7 +1307,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1308 media = b->media; 1307 media = b->media;
1309 if (!fat_valid_media(media)) { 1308 if (!fat_valid_media(media)) {
1310 if (!silent) 1309 if (!silent)
1311 printk(KERN_ERR "FAT: invalid media value (0x%02x)\n", 1310 fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)",
1312 media); 1311 media);
1313 brelse(bh); 1312 brelse(bh);
1314 goto out_invalid; 1313 goto out_invalid;
@@ -1318,7 +1317,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1318 || (logical_sector_size < 512) 1317 || (logical_sector_size < 512)
1319 || (logical_sector_size > 4096)) { 1318 || (logical_sector_size > 4096)) {
1320 if (!silent) 1319 if (!silent)
1321 printk(KERN_ERR "FAT: bogus logical sector size %u\n", 1320 fat_msg(sb, KERN_ERR, "bogus logical sector size %u",
1322 logical_sector_size); 1321 logical_sector_size);
1323 brelse(bh); 1322 brelse(bh);
1324 goto out_invalid; 1323 goto out_invalid;
@@ -1326,15 +1325,15 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1326 sbi->sec_per_clus = b->sec_per_clus; 1325 sbi->sec_per_clus = b->sec_per_clus;
1327 if (!is_power_of_2(sbi->sec_per_clus)) { 1326 if (!is_power_of_2(sbi->sec_per_clus)) {
1328 if (!silent) 1327 if (!silent)
1329 printk(KERN_ERR "FAT: bogus sectors per cluster %u\n", 1328 fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u",
1330 sbi->sec_per_clus); 1329 sbi->sec_per_clus);
1331 brelse(bh); 1330 brelse(bh);
1332 goto out_invalid; 1331 goto out_invalid;
1333 } 1332 }
1334 1333
1335 if (logical_sector_size < sb->s_blocksize) { 1334 if (logical_sector_size < sb->s_blocksize) {
1336 printk(KERN_ERR "FAT: logical sector size too small for device" 1335 fat_msg(sb, KERN_ERR, "logical sector size too small for device"
1337 " (logical sector size = %u)\n", logical_sector_size); 1336 " (logical sector size = %u)", logical_sector_size);
1338 brelse(bh); 1337 brelse(bh);
1339 goto out_fail; 1338 goto out_fail;
1340 } 1339 }
@@ -1342,14 +1341,14 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1342 brelse(bh); 1341 brelse(bh);
1343 1342
1344 if (!sb_set_blocksize(sb, logical_sector_size)) { 1343 if (!sb_set_blocksize(sb, logical_sector_size)) {
1345 printk(KERN_ERR "FAT: unable to set blocksize %u\n", 1344 fat_msg(sb, KERN_ERR, "unable to set blocksize %u",
1346 logical_sector_size); 1345 logical_sector_size);
1347 goto out_fail; 1346 goto out_fail;
1348 } 1347 }
1349 bh = sb_bread(sb, 0); 1348 bh = sb_bread(sb, 0);
1350 if (bh == NULL) { 1349 if (bh == NULL) {
1351 printk(KERN_ERR "FAT: unable to read boot sector" 1350 fat_msg(sb, KERN_ERR, "unable to read boot sector"
1352 " (logical sector size = %lu)\n", 1351 " (logical sector size = %lu)",
1353 sb->s_blocksize); 1352 sb->s_blocksize);
1354 goto out_fail; 1353 goto out_fail;
1355 } 1354 }
@@ -1385,16 +1384,16 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1385 1384
1386 fsinfo_bh = sb_bread(sb, sbi->fsinfo_sector); 1385 fsinfo_bh = sb_bread(sb, sbi->fsinfo_sector);
1387 if (fsinfo_bh == NULL) { 1386 if (fsinfo_bh == NULL) {
1388 printk(KERN_ERR "FAT: bread failed, FSINFO block" 1387 fat_msg(sb, KERN_ERR, "bread failed, FSINFO block"
1389 " (sector = %lu)\n", sbi->fsinfo_sector); 1388 " (sector = %lu)", sbi->fsinfo_sector);
1390 brelse(bh); 1389 brelse(bh);
1391 goto out_fail; 1390 goto out_fail;
1392 } 1391 }
1393 1392
1394 fsinfo = (struct fat_boot_fsinfo *)fsinfo_bh->b_data; 1393 fsinfo = (struct fat_boot_fsinfo *)fsinfo_bh->b_data;
1395 if (!IS_FSINFO(fsinfo)) { 1394 if (!IS_FSINFO(fsinfo)) {
1396 printk(KERN_WARNING "FAT: Invalid FSINFO signature: " 1395 fat_msg(sb, KERN_WARNING, "Invalid FSINFO signature: "
1397 "0x%08x, 0x%08x (sector = %lu)\n", 1396 "0x%08x, 0x%08x (sector = %lu)",
1398 le32_to_cpu(fsinfo->signature1), 1397 le32_to_cpu(fsinfo->signature1),
1399 le32_to_cpu(fsinfo->signature2), 1398 le32_to_cpu(fsinfo->signature2),
1400 sbi->fsinfo_sector); 1399 sbi->fsinfo_sector);
@@ -1415,8 +1414,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1415 sbi->dir_entries = get_unaligned_le16(&b->dir_entries); 1414 sbi->dir_entries = get_unaligned_le16(&b->dir_entries);
1416 if (sbi->dir_entries & (sbi->dir_per_block - 1)) { 1415 if (sbi->dir_entries & (sbi->dir_per_block - 1)) {
1417 if (!silent) 1416 if (!silent)
1418 printk(KERN_ERR "FAT: bogus directroy-entries per block" 1417 fat_msg(sb, KERN_ERR, "bogus directroy-entries per block"
1419 " (%u)\n", sbi->dir_entries); 1418 " (%u)", sbi->dir_entries);
1420 brelse(bh); 1419 brelse(bh);
1421 goto out_invalid; 1420 goto out_invalid;
1422 } 1421 }
@@ -1438,7 +1437,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1438 total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); 1437 total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT);
1439 if (total_clusters > MAX_FAT(sb)) { 1438 if (total_clusters > MAX_FAT(sb)) {
1440 if (!silent) 1439 if (!silent)
1441 printk(KERN_ERR "FAT: count of clusters too big (%u)\n", 1440 fat_msg(sb, KERN_ERR, "count of clusters too big (%u)",
1442 total_clusters); 1441 total_clusters);
1443 brelse(bh); 1442 brelse(bh);
1444 goto out_invalid; 1443 goto out_invalid;
@@ -1471,7 +1470,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1471 sprintf(buf, "cp%d", sbi->options.codepage); 1470 sprintf(buf, "cp%d", sbi->options.codepage);
1472 sbi->nls_disk = load_nls(buf); 1471 sbi->nls_disk = load_nls(buf);
1473 if (!sbi->nls_disk) { 1472 if (!sbi->nls_disk) {
1474 printk(KERN_ERR "FAT: codepage %s not found\n", buf); 1473 fat_msg(sb, KERN_ERR, "codepage %s not found", buf);
1475 goto out_fail; 1474 goto out_fail;
1476 } 1475 }
1477 1476
@@ -1479,7 +1478,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1479 if (sbi->options.isvfat) { 1478 if (sbi->options.isvfat) {
1480 sbi->nls_io = load_nls(sbi->options.iocharset); 1479 sbi->nls_io = load_nls(sbi->options.iocharset);
1481 if (!sbi->nls_io) { 1480 if (!sbi->nls_io) {
1482 printk(KERN_ERR "FAT: IO charset %s not found\n", 1481 fat_msg(sb, KERN_ERR, "IO charset %s not found",
1483 sbi->options.iocharset); 1482 sbi->options.iocharset);
1484 goto out_fail; 1483 goto out_fail;
1485 } 1484 }
@@ -1503,7 +1502,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1503 insert_inode_hash(root_inode); 1502 insert_inode_hash(root_inode);
1504 sb->s_root = d_alloc_root(root_inode); 1503 sb->s_root = d_alloc_root(root_inode);
1505 if (!sb->s_root) { 1504 if (!sb->s_root) {
1506 printk(KERN_ERR "FAT: get root inode failed\n"); 1505 fat_msg(sb, KERN_ERR, "get root inode failed");
1507 goto out_fail; 1506 goto out_fail;
1508 } 1507 }
1509 1508
@@ -1512,8 +1511,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1512out_invalid: 1511out_invalid:
1513 error = -EINVAL; 1512 error = -EINVAL;
1514 if (!silent) 1513 if (!silent)
1515 printk(KERN_INFO "VFS: Can't find a valid FAT filesystem" 1514 fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem");
1516 " on dev %s.\n", sb->s_id);
1517 1515
1518out_fail: 1516out_fail:
1519 if (fat_inode) 1517 if (fat_inode)
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 970e682ea754..6d93360ca0cc 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,30 +20,46 @@
20 * In case the file system is remounted read-only, it can be made writable 20 * In case the file system is remounted read-only, it can be made writable
21 * again by remounting it. 21 * again by remounting it.
22 */ 22 */
23void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...) 23void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
24{ 24{
25 struct fat_mount_options *opts = &MSDOS_SB(s)->options; 25 struct fat_mount_options *opts = &MSDOS_SB(sb)->options;
26 va_list args; 26 va_list args;
27 struct va_format vaf;
27 28
28 if (report) { 29 if (report) {
29 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
30
31 printk(KERN_ERR " ");
32 va_start(args, fmt); 30 va_start(args, fmt);
33 vprintk(fmt, args); 31 vaf.fmt = fmt;
32 vaf.va = &args;
33 printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf);
34 va_end(args); 34 va_end(args);
35 printk("\n");
36 } 35 }
37 36
38 if (opts->errors == FAT_ERRORS_PANIC) 37 if (opts->errors == FAT_ERRORS_PANIC)
39 panic("FAT: fs panic from previous error\n"); 38 panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
40 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) { 39 else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) {
41 s->s_flags |= MS_RDONLY; 40 sb->s_flags |= MS_RDONLY;
42 printk(KERN_ERR "FAT: Filesystem has been set read-only\n"); 41 printk(KERN_ERR "FAT-fs (%s): Filesystem has been "
42 "set read-only\n", sb->s_id);
43 } 43 }
44} 44}
45EXPORT_SYMBOL_GPL(__fat_fs_error); 45EXPORT_SYMBOL_GPL(__fat_fs_error);
46 46
47/**
48 * fat_msg() - print preformatted FAT-specific messages. Everything that is
49 * not fat_fs_error() should be fat_msg().
50 */
51void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
52{
53 struct va_format vaf;
54 va_list args;
55
56 va_start(args, fmt);
57 vaf.fmt = fmt;
58 vaf.va = &args;
59 printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf);
60 va_end(args);
61}
62
47/* Flushes the number of free clusters on FAT32 */ 63/* Flushes the number of free clusters on FAT32 */
48/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 64/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
49int fat_clusters_flush(struct super_block *sb) 65int fat_clusters_flush(struct super_block *sb)
@@ -57,15 +73,15 @@ int fat_clusters_flush(struct super_block *sb)
57 73
58 bh = sb_bread(sb, sbi->fsinfo_sector); 74 bh = sb_bread(sb, sbi->fsinfo_sector);
59 if (bh == NULL) { 75 if (bh == NULL) {
60 printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n"); 76 fat_msg(sb, KERN_ERR, "bread failed in fat_clusters_flush");
61 return -EIO; 77 return -EIO;
62 } 78 }
63 79
64 fsinfo = (struct fat_boot_fsinfo *)bh->b_data; 80 fsinfo = (struct fat_boot_fsinfo *)bh->b_data;
65 /* Sanity check */ 81 /* Sanity check */
66 if (!IS_FSINFO(fsinfo)) { 82 if (!IS_FSINFO(fsinfo)) {
67 printk(KERN_ERR "FAT: Invalid FSINFO signature: " 83 fat_msg(sb, KERN_ERR, "Invalid FSINFO signature: "
68 "0x%08x, 0x%08x (sector = %lu)\n", 84 "0x%08x, 0x%08x (sector = %lu)",
69 le32_to_cpu(fsinfo->signature1), 85 le32_to_cpu(fsinfo->signature1),
70 le32_to_cpu(fsinfo->signature2), 86 le32_to_cpu(fsinfo->signature2),
71 sbi->fsinfo_sector); 87 sbi->fsinfo_sector);
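The helper above relies on the kernel's %pV printk extension: a struct va_format carries the caller's format string and va_list, so the "FAT-fs (<dev>): " prefix and the message body are emitted as one printk record instead of the old multi-call sequence that could interleave with output from other CPUs. Plain C has no %pV, but the calling convention can be approximated in userspace with vfprintf; everything below is an illustrative sketch, not kernel code.

/* Userspace sketch of the fat_msg() convention: one varargs helper
 * prefixes the device id and emits a single line. ("<3>" stands in
 * for the KERN_ERR level prefix of this kernel era.) */
#include <stdarg.h>
#include <stdio.h>

static void fat_msg_demo(const char *level, const char *dev,
			 const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	fprintf(stderr, "%sFAT-fs (%s): ", level, dev);
	vfprintf(stderr, fmt, args);
	fputc('\n', stderr);
	va_end(args);
}

int main(void)
{
	fat_msg_demo("<3>", "sda1", "bogus logical sector size %u", 1536);
	return 0;
}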
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 711499040eb6..be15437c272e 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -326,6 +326,8 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
326 struct fat_slot_info sinfo; 326 struct fat_slot_info sinfo;
327 int err; 327 int err;
328 328
329 dentry_unhash(dentry);
330
329 lock_super(sb); 331 lock_super(sb);
330 /* 332 /*
331 * Check whether the directory is not in use, then check 333 * Check whether the directory is not in use, then check
@@ -457,6 +459,9 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
457 old_inode = old_dentry->d_inode; 459 old_inode = old_dentry->d_inode;
458 new_inode = new_dentry->d_inode; 460 new_inode = new_dentry->d_inode;
459 461
462 if (new_inode && S_ISDIR(new_inode->i_mode))
463 dentry_unhash(new_dentry);
464
460 err = fat_scan(old_dir, old_name, &old_sinfo); 465 err = fat_scan(old_dir, old_name, &old_sinfo);
461 if (err) { 466 if (err) {
462 err = -EIO; 467 err = -EIO;
@@ -659,14 +664,14 @@ static const struct inode_operations msdos_dir_inode_operations = {
659 664
660static void setup(struct super_block *sb) 665static void setup(struct super_block *sb)
661{ 666{
667 MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations;
662 sb->s_d_op = &msdos_dentry_operations; 668 sb->s_d_op = &msdos_dentry_operations;
663 sb->s_flags |= MS_NOATIME; 669 sb->s_flags |= MS_NOATIME;
664} 670}
665 671
666static int msdos_fill_super(struct super_block *sb, void *data, int silent) 672static int msdos_fill_super(struct super_block *sb, void *data, int silent)
667{ 673{
668 return fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 674 return fat_fill_super(sb, data, silent, 0, setup);
669 0, setup);
670} 675}
671 676
672static struct dentry *msdos_mount(struct file_system_type *fs_type, 677static struct dentry *msdos_mount(struct file_system_type *fs_type,
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index adae3fb7451a..c61a6789f36c 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -824,6 +824,8 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
824 struct fat_slot_info sinfo; 824 struct fat_slot_info sinfo;
825 int err; 825 int err;
826 826
827 dentry_unhash(dentry);
828
827 lock_super(sb); 829 lock_super(sb);
828 830
829 err = fat_dir_empty(inode); 831 err = fat_dir_empty(inode);
@@ -931,6 +933,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
931 int err, is_dir, update_dotdot, corrupt = 0; 933 int err, is_dir, update_dotdot, corrupt = 0;
932 struct super_block *sb = old_dir->i_sb; 934 struct super_block *sb = old_dir->i_sb;
933 935
936 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
937 dentry_unhash(new_dentry);
938
934 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; 939 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
935 old_inode = old_dentry->d_inode; 940 old_inode = old_dentry->d_inode;
936 new_inode = new_dentry->d_inode; 941 new_inode = new_dentry->d_inode;
@@ -1065,6 +1070,7 @@ static const struct inode_operations vfat_dir_inode_operations = {
1065 1070
1066static void setup(struct super_block *sb) 1071static void setup(struct super_block *sb)
1067{ 1072{
1073 MSDOS_SB(sb)->dir_ops = &vfat_dir_inode_operations;
1068 if (MSDOS_SB(sb)->options.name_check != 's') 1074 if (MSDOS_SB(sb)->options.name_check != 's')
1069 sb->s_d_op = &vfat_ci_dentry_ops; 1075 sb->s_d_op = &vfat_ci_dentry_ops;
1070 else 1076 else
@@ -1073,8 +1079,7 @@ static void setup(struct super_block *sb)
1073 1079
1074static int vfat_fill_super(struct super_block *sb, void *data, int silent) 1080static int vfat_fill_super(struct super_block *sb, void *data, int silent)
1075{ 1081{
1076 return fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1082 return fat_fill_super(sb, data, silent, 1, setup);
1077 1, setup);
1078} 1083}
1079 1084
1080static struct dentry *vfat_mount(struct file_system_type *fs_type, 1085static struct dentry *vfat_mount(struct file_system_type *fs_type,
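The dentry_unhash() calls added to msdos_rmdir(), do_msdos_rename(), vfat_rmdir() and vfat_rename() (fuse gets the same treatment below) track a VFS-wide change in this series: the VFS no longer unhashes the victim dentry before ->rmdir(), or before ->rename() onto an existing directory, so filesystems that depended on that behaviour now do it themselves. A kernel-style sketch of the convention, with hypothetical function names; this shows the call pattern, not a complete filesystem:

#include <linux/fs.h>
#include <linux/dcache.h>

static int example_rmdir(struct inode *dir, struct dentry *dentry)
{
	dentry_unhash(dentry);	/* force later lookups to miss the dcache */

	/* ... verify the directory is empty, then remove it on disk ... */
	return 0;
}

static int example_rename(struct inode *old_dir, struct dentry *old_dentry,
			  struct inode *new_dir, struct dentry *new_dentry)
{
	/* only an overwriting rename whose target is a directory
	 * needs the unhash */
	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
		dentry_unhash(new_dentry);

	/* ... perform the rename, removing the old target if any ... */
	return 0;
}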
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 2ba6719ac612..1a4311437a8b 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -272,7 +272,7 @@ vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip)
272 * *ip: VFS inode 272 * *ip: VFS inode
273 * 273 *
274 * Description: 274 * Description:
275 * vxfs_put_fake_inode frees all data asssociated with @ip. 275 * vxfs_put_fake_inode frees all data associated with @ip.
276 */ 276 */
277void 277void
278vxfs_put_fake_inode(struct inode *ip) 278vxfs_put_fake_inode(struct inode *ip)
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 48a18f184d50..30afdfa7aec7 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -33,8 +33,6 @@ void fscache_enqueue_operation(struct fscache_operation *op)
33 _enter("{OBJ%x OP%x,%u}", 33 _enter("{OBJ%x OP%x,%u}",
34 op->object->debug_id, op->debug_id, atomic_read(&op->usage)); 34 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
35 35
36 fscache_set_op_state(op, "EnQ");
37
38 ASSERT(list_empty(&op->pend_link)); 36 ASSERT(list_empty(&op->pend_link));
39 ASSERT(op->processor != NULL); 37 ASSERT(op->processor != NULL);
40 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); 38 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
@@ -66,8 +64,6 @@ EXPORT_SYMBOL(fscache_enqueue_operation);
66static void fscache_run_op(struct fscache_object *object, 64static void fscache_run_op(struct fscache_object *object,
67 struct fscache_operation *op) 65 struct fscache_operation *op)
68{ 66{
69 fscache_set_op_state(op, "Run");
70
71 object->n_in_progress++; 67 object->n_in_progress++;
72 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) 68 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
73 wake_up_bit(&op->flags, FSCACHE_OP_WAITING); 69 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
@@ -88,8 +84,6 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
88 84
89 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); 85 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
90 86
91 fscache_set_op_state(op, "SubmitX");
92
93 spin_lock(&object->lock); 87 spin_lock(&object->lock);
94 ASSERTCMP(object->n_ops, >=, object->n_in_progress); 88 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
95 ASSERTCMP(object->n_ops, >=, object->n_exclusive); 89 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
@@ -194,8 +188,6 @@ int fscache_submit_op(struct fscache_object *object,
194 188
195 ASSERTCMP(atomic_read(&op->usage), >, 0); 189 ASSERTCMP(atomic_read(&op->usage), >, 0);
196 190
197 fscache_set_op_state(op, "Submit");
198
199 spin_lock(&object->lock); 191 spin_lock(&object->lock);
200 ASSERTCMP(object->n_ops, >=, object->n_in_progress); 192 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
201 ASSERTCMP(object->n_ops, >=, object->n_exclusive); 193 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
@@ -335,8 +327,6 @@ void fscache_put_operation(struct fscache_operation *op)
335 if (!atomic_dec_and_test(&op->usage)) 327 if (!atomic_dec_and_test(&op->usage))
336 return; 328 return;
337 329
338 fscache_set_op_state(op, "Put");
339
340 _debug("PUT OP"); 330 _debug("PUT OP");
341 if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags)) 331 if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
342 BUG(); 332 BUG();
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 41c441c2058d..a2a5d19ece6a 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -155,11 +155,9 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
155 fscache_stat(&fscache_n_attr_changed_calls); 155 fscache_stat(&fscache_n_attr_changed_calls);
156 156
157 if (fscache_object_is_active(object)) { 157 if (fscache_object_is_active(object)) {
158 fscache_set_op_state(op, "CallFS");
159 fscache_stat(&fscache_n_cop_attr_changed); 158 fscache_stat(&fscache_n_cop_attr_changed);
160 ret = object->cache->ops->attr_changed(object); 159 ret = object->cache->ops->attr_changed(object);
161 fscache_stat_d(&fscache_n_cop_attr_changed); 160 fscache_stat_d(&fscache_n_cop_attr_changed);
162 fscache_set_op_state(op, "Done");
163 if (ret < 0) 161 if (ret < 0)
164 fscache_abort_object(object); 162 fscache_abort_object(object);
165 } 163 }
@@ -190,7 +188,6 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
190 188
191 fscache_operation_init(op, fscache_attr_changed_op, NULL); 189 fscache_operation_init(op, fscache_attr_changed_op, NULL);
192 op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); 190 op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
193 fscache_set_op_name(op, "Attr");
194 191
195 spin_lock(&cookie->lock); 192 spin_lock(&cookie->lock);
196 193
@@ -257,7 +254,6 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
257 op->context = context; 254 op->context = context;
258 op->start_time = jiffies; 255 op->start_time = jiffies;
259 INIT_LIST_HEAD(&op->to_do); 256 INIT_LIST_HEAD(&op->to_do);
260 fscache_set_op_name(&op->op, "Retr");
261 return op; 257 return op;
262} 258}
263 259
@@ -368,7 +364,6 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
368 _leave(" = -ENOMEM"); 364 _leave(" = -ENOMEM");
369 return -ENOMEM; 365 return -ENOMEM;
370 } 366 }
371 fscache_set_op_name(&op->op, "RetrRA1");
372 367
373 spin_lock(&cookie->lock); 368 spin_lock(&cookie->lock);
374 369
@@ -487,7 +482,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
487 op = fscache_alloc_retrieval(mapping, end_io_func, context); 482 op = fscache_alloc_retrieval(mapping, end_io_func, context);
488 if (!op) 483 if (!op)
489 return -ENOMEM; 484 return -ENOMEM;
490 fscache_set_op_name(&op->op, "RetrRAN");
491 485
492 spin_lock(&cookie->lock); 486 spin_lock(&cookie->lock);
493 487
@@ -589,7 +583,6 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
589 op = fscache_alloc_retrieval(page->mapping, NULL, NULL); 583 op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
590 if (!op) 584 if (!op)
591 return -ENOMEM; 585 return -ENOMEM;
592 fscache_set_op_name(&op->op, "RetrAL1");
593 586
594 spin_lock(&cookie->lock); 587 spin_lock(&cookie->lock);
595 588
@@ -662,8 +655,6 @@ static void fscache_write_op(struct fscache_operation *_op)
662 655
663 _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); 656 _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
664 657
665 fscache_set_op_state(&op->op, "GetPage");
666
667 spin_lock(&object->lock); 658 spin_lock(&object->lock);
668 cookie = object->cookie; 659 cookie = object->cookie;
669 660
@@ -698,15 +689,12 @@ static void fscache_write_op(struct fscache_operation *_op)
698 spin_unlock(&cookie->stores_lock); 689 spin_unlock(&cookie->stores_lock);
699 spin_unlock(&object->lock); 690 spin_unlock(&object->lock);
700 691
701 fscache_set_op_state(&op->op, "Store");
702 fscache_stat(&fscache_n_store_pages); 692 fscache_stat(&fscache_n_store_pages);
703 fscache_stat(&fscache_n_cop_write_page); 693 fscache_stat(&fscache_n_cop_write_page);
704 ret = object->cache->ops->write_page(op, page); 694 ret = object->cache->ops->write_page(op, page);
705 fscache_stat_d(&fscache_n_cop_write_page); 695 fscache_stat_d(&fscache_n_cop_write_page);
706 fscache_set_op_state(&op->op, "EndWrite");
707 fscache_end_page_write(object, page); 696 fscache_end_page_write(object, page);
708 if (ret < 0) { 697 if (ret < 0) {
709 fscache_set_op_state(&op->op, "Abort");
710 fscache_abort_object(object); 698 fscache_abort_object(object);
711 } else { 699 } else {
712 fscache_enqueue_operation(&op->op); 700 fscache_enqueue_operation(&op->op);
@@ -778,7 +766,6 @@ int __fscache_write_page(struct fscache_cookie *cookie,
778 fscache_operation_init(&op->op, fscache_write_op, 766 fscache_operation_init(&op->op, fscache_write_op,
779 fscache_release_write_op); 767 fscache_release_write_op);
780 op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); 768 op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
781 fscache_set_op_name(&op->op, "Write1");
782 769
783 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 770 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
784 if (ret < 0) 771 if (ret < 0)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b32eb29a4e6f..0d0e3faddcfa 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -667,6 +667,8 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
667 if (IS_ERR(req)) 667 if (IS_ERR(req))
668 return PTR_ERR(req); 668 return PTR_ERR(req);
669 669
670 dentry_unhash(entry);
671
670 req->in.h.opcode = FUSE_RMDIR; 672 req->in.h.opcode = FUSE_RMDIR;
671 req->in.h.nodeid = get_node_id(dir); 673 req->in.h.nodeid = get_node_id(dir);
672 req->in.numargs = 1; 674 req->in.numargs = 1;
@@ -691,6 +693,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
691 struct fuse_rename_in inarg; 693 struct fuse_rename_in inarg;
692 struct fuse_conn *fc = get_fuse_conn(olddir); 694 struct fuse_conn *fc = get_fuse_conn(olddir);
693 struct fuse_req *req = fuse_get_req(fc); 695 struct fuse_req *req = fuse_get_req(fc);
696
697 if (newent->d_inode && S_ISDIR(newent->d_inode->i_mode))
698 dentry_unhash(newent);
699
694 if (IS_ERR(req)) 700 if (IS_ERR(req))
695 return PTR_ERR(req); 701 return PTR_ERR(req);
696 702
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index f3d23ef4e876..86128202384f 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,9 +1,9 @@
1ccflags-y := -I$(src) 1ccflags-y := -I$(src)
2obj-$(CONFIG_GFS2_FS) += gfs2.o 2obj-$(CONFIG_GFS2_FS) += gfs2.o
3gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \ 3gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
4 glops.o inode.o log.o lops.o main.o meta_io.o \ 4 glops.o log.o lops.o main.o meta_io.o \
5 aops.o dentry.o export.o file.o \ 5 aops.o dentry.o export.o file.o \
6 ops_fstype.o ops_inode.o quota.o \ 6 ops_fstype.o inode.o quota.o \
7 recovery.o rgrp.o super.o sys.o trans.o util.o 7 recovery.o rgrp.o super.o sys.o trans.o util.o
8 8
9gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o 9gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0f5c4f9d5d62..802ac5eeba28 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1076,8 +1076,8 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
1076 bd = bh->b_private; 1076 bd = bh->b_private;
1077 if (bd && bd->bd_ail) 1077 if (bd && bd->bd_ail)
1078 goto cannot_release; 1078 goto cannot_release;
1079 gfs2_assert_warn(sdp, !buffer_pinned(bh)); 1079 if (buffer_pinned(bh) || buffer_dirty(bh))
1080 gfs2_assert_warn(sdp, !buffer_dirty(bh)); 1080 goto not_possible;
1081 bh = bh->b_this_page; 1081 bh = bh->b_this_page;
1082 } while(bh != head); 1082 } while(bh != head);
1083 gfs2_log_unlock(sdp); 1083 gfs2_log_unlock(sdp);
@@ -1107,6 +1107,10 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
1107 } while (bh != head); 1107 } while (bh != head);
1108 1108
1109 return try_to_free_buffers(page); 1109 return try_to_free_buffers(page);
1110
1111not_possible: /* Should never happen */
1112 WARN_ON(buffer_dirty(bh));
1113 WARN_ON(buffer_pinned(bh));
1110cannot_release: 1114cannot_release:
1111 gfs2_log_unlock(sdp); 1115 gfs2_log_unlock(sdp);
1112 return 0; 1116 return 0;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 74add2ddcc3f..e65493a8ac00 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -780,6 +780,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
780 metadata = (height != ip->i_height - 1); 780 metadata = (height != ip->i_height - 1);
781 if (metadata) 781 if (metadata)
782 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; 782 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
783 else if (ip->i_depth)
784 revokes = sdp->sd_inptrs;
783 785
784 if (ip != GFS2_I(sdp->sd_rindex)) 786 if (ip != GFS2_I(sdp->sd_rindex))
785 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); 787 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index f789c5732b7c..091ee4779538 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -82,12 +82,9 @@
82struct qstr gfs2_qdot __read_mostly; 82struct qstr gfs2_qdot __read_mostly;
83struct qstr gfs2_qdotdot __read_mostly; 83struct qstr gfs2_qdotdot __read_mostly;
84 84
85typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
86 u64 leaf_no, void *data);
87typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, 85typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
88 const struct qstr *name, void *opaque); 86 const struct qstr *name, void *opaque);
89 87
90
91int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, 88int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
92 struct buffer_head **bhp) 89 struct buffer_head **bhp)
93{ 90{
@@ -1600,7 +1597,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1600 */ 1597 */
1601 1598
1602int gfs2_dir_add(struct inode *inode, const struct qstr *name, 1599int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1603 const struct gfs2_inode *nip, unsigned type) 1600 const struct gfs2_inode *nip)
1604{ 1601{
1605 struct gfs2_inode *ip = GFS2_I(inode); 1602 struct gfs2_inode *ip = GFS2_I(inode);
1606 struct buffer_head *bh; 1603 struct buffer_head *bh;
@@ -1616,7 +1613,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1616 return PTR_ERR(dent); 1613 return PTR_ERR(dent);
1617 dent = gfs2_init_dirent(inode, dent, name, bh); 1614 dent = gfs2_init_dirent(inode, dent, name, bh);
1618 gfs2_inum_out(nip, dent); 1615 gfs2_inum_out(nip, dent);
1619 dent->de_type = cpu_to_be16(type); 1616 dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
1620 if (ip->i_diskflags & GFS2_DIF_EXHASH) { 1617 if (ip->i_diskflags & GFS2_DIF_EXHASH) {
1621 leaf = (struct gfs2_leaf *)bh->b_data; 1618 leaf = (struct gfs2_leaf *)bh->b_data;
1622 be16_add_cpu(&leaf->lf_entries, 1); 1619 be16_add_cpu(&leaf->lf_entries, 1);
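gfs2_dir_add() previously took the on-disk entry type as a separate argument that every caller had to keep consistent with the inode being linked; deriving it from nip->i_inode.i_mode removes that redundancy. In GFS2 the directory-entry type is the mode's format bits shifted down by twelve, which lines up with the DT_* values readdir reports. A runnable demonstration of the arithmetic (IF2DT_DEMO is a local re-derivation for illustration, not the kernel macro):

#include <stdio.h>
#include <sys/stat.h>

#define IF2DT_DEMO(mode) (((mode) & S_IFMT) >> 12)

int main(void)
{
	printf("regular file -> %u\n", IF2DT_DEMO(S_IFREG | 0644)); /* 8, DT_REG */
	printf("directory    -> %u\n", IF2DT_DEMO(S_IFDIR | 0755)); /* 4, DT_DIR */
	printf("symlink      -> %u\n", IF2DT_DEMO(S_IFLNK | 0777)); /* 10, DT_LNK */
	return 0;
}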
@@ -1628,6 +1625,8 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1628 gfs2_trans_add_bh(ip->i_gl, bh, 1); 1625 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1629 ip->i_entries++; 1626 ip->i_entries++;
1630 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1627 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1628 if (S_ISDIR(nip->i_inode.i_mode))
1629 inc_nlink(&ip->i_inode);
1631 gfs2_dinode_out(ip, bh->b_data); 1630 gfs2_dinode_out(ip, bh->b_data);
1632 brelse(bh); 1631 brelse(bh);
1633 error = 0; 1632 error = 0;
@@ -1672,8 +1671,9 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1672 * Returns: 0 on success, error code on failure 1671 * Returns: 0 on success, error code on failure
1673 */ 1672 */
1674 1673
1675int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name) 1674int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
1676{ 1675{
1676 const struct qstr *name = &dentry->d_name;
1677 struct gfs2_dirent *dent, *prev = NULL; 1677 struct gfs2_dirent *dent, *prev = NULL;
1678 struct buffer_head *bh; 1678 struct buffer_head *bh;
1679 int error; 1679 int error;
@@ -1714,6 +1714,8 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1714 gfs2_trans_add_bh(dip->i_gl, bh, 1); 1714 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1715 dip->i_entries--; 1715 dip->i_entries--;
1716 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; 1716 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
1717 if (S_ISDIR(dentry->d_inode->i_mode))
1718 drop_nlink(&dip->i_inode);
1717 gfs2_dinode_out(dip, bh->b_data); 1719 gfs2_dinode_out(dip, bh->b_data);
1718 brelse(bh); 1720 brelse(bh);
1719 mark_inode_dirty(&dip->i_inode); 1721 mark_inode_dirty(&dip->i_inode);
@@ -1768,94 +1770,20 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1768} 1770}
1769 1771
1770/** 1772/**
1771 * foreach_leaf - call a function for each leaf in a directory
1772 * @dip: the directory
1773 * @lc: the function to call for each leaf
1774 * @data: private data to pass to it
1775 *
1776 * Returns: errno
1777 */
1778
1779static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1780{
1781 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1782 struct buffer_head *bh;
1783 struct gfs2_leaf *leaf;
1784 u32 hsize, len;
1785 u32 ht_offset, lp_offset, ht_offset_cur = -1;
1786 u32 index = 0;
1787 __be64 *lp;
1788 u64 leaf_no;
1789 int error = 0;
1790
1791 hsize = 1 << dip->i_depth;
1792 if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
1793 gfs2_consist_inode(dip);
1794 return -EIO;
1795 }
1796
1797 lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
1798 if (!lp)
1799 return -ENOMEM;
1800
1801 while (index < hsize) {
1802 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1803 ht_offset = index - lp_offset;
1804
1805 if (ht_offset_cur != ht_offset) {
1806 error = gfs2_dir_read_data(dip, (char *)lp,
1807 ht_offset * sizeof(__be64),
1808 sdp->sd_hash_bsize, 1);
1809 if (error != sdp->sd_hash_bsize) {
1810 if (error >= 0)
1811 error = -EIO;
1812 goto out;
1813 }
1814 ht_offset_cur = ht_offset;
1815 }
1816
1817 leaf_no = be64_to_cpu(lp[lp_offset]);
1818 if (leaf_no) {
1819 error = get_leaf(dip, leaf_no, &bh);
1820 if (error)
1821 goto out;
1822 leaf = (struct gfs2_leaf *)bh->b_data;
1823 len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth));
1824 brelse(bh);
1825
1826 error = lc(dip, index, len, leaf_no, data);
1827 if (error)
1828 goto out;
1829
1830 index = (index & ~(len - 1)) + len;
1831 } else
1832 index++;
1833 }
1834
1835 if (index != hsize) {
1836 gfs2_consist_inode(dip);
1837 error = -EIO;
1838 }
1839
1840out:
1841 kfree(lp);
1842
1843 return error;
1844}
1845
1846/**
1847 * leaf_dealloc - Deallocate a directory leaf 1773 * leaf_dealloc - Deallocate a directory leaf
1848 * @dip: the directory 1774 * @dip: the directory
1849 * @index: the hash table offset in the directory 1775 * @index: the hash table offset in the directory
1850 * @len: the number of pointers to this leaf 1776 * @len: the number of pointers to this leaf
1851 * @leaf_no: the leaf number 1777 * @leaf_no: the leaf number
1852 * @data: not used 1778 * @leaf_bh: buffer_head for the starting leaf
1779 * @last_dealloc: 1 if this is the final dealloc for the leaf, else 0
1853 * 1780 *
1854 * Returns: errno 1781 * Returns: errno
1855 */ 1782 */
1856 1783
1857static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, 1784static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1858 u64 leaf_no, void *data) 1785 u64 leaf_no, struct buffer_head *leaf_bh,
1786 int last_dealloc)
1859{ 1787{
1860 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 1788 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1861 struct gfs2_leaf *tmp_leaf; 1789 struct gfs2_leaf *tmp_leaf;
@@ -1887,14 +1815,18 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1887 goto out_qs; 1815 goto out_qs;
1888 1816
1889 /* Count the number of leaves */ 1817 /* Count the number of leaves */
1818 bh = leaf_bh;
1890 1819
1891 for (blk = leaf_no; blk; blk = nblk) { 1820 for (blk = leaf_no; blk; blk = nblk) {
1892 error = get_leaf(dip, blk, &bh); 1821 if (blk != leaf_no) {
1893 if (error) 1822 error = get_leaf(dip, blk, &bh);
1894 goto out_rlist; 1823 if (error)
1824 goto out_rlist;
1825 }
1895 tmp_leaf = (struct gfs2_leaf *)bh->b_data; 1826 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1896 nblk = be64_to_cpu(tmp_leaf->lf_next); 1827 nblk = be64_to_cpu(tmp_leaf->lf_next);
1897 brelse(bh); 1828 if (blk != leaf_no)
1829 brelse(bh);
1898 1830
1899 gfs2_rlist_add(sdp, &rlist, blk); 1831 gfs2_rlist_add(sdp, &rlist, blk);
1900 l_blocks++; 1832 l_blocks++;
@@ -1918,13 +1850,18 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1918 if (error) 1850 if (error)
1919 goto out_rg_gunlock; 1851 goto out_rg_gunlock;
1920 1852
1853 bh = leaf_bh;
1854
1921 for (blk = leaf_no; blk; blk = nblk) { 1855 for (blk = leaf_no; blk; blk = nblk) {
1922 error = get_leaf(dip, blk, &bh); 1856 if (blk != leaf_no) {
1923 if (error) 1857 error = get_leaf(dip, blk, &bh);
1924 goto out_end_trans; 1858 if (error)
1859 goto out_end_trans;
1860 }
1925 tmp_leaf = (struct gfs2_leaf *)bh->b_data; 1861 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1926 nblk = be64_to_cpu(tmp_leaf->lf_next); 1862 nblk = be64_to_cpu(tmp_leaf->lf_next);
1927 brelse(bh); 1863 if (blk != leaf_no)
1864 brelse(bh);
1928 1865
1929 gfs2_free_meta(dip, blk, 1); 1866 gfs2_free_meta(dip, blk, 1);
1930 gfs2_add_inode_blocks(&dip->i_inode, -1); 1867 gfs2_add_inode_blocks(&dip->i_inode, -1);
@@ -1942,6 +1879,10 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1942 goto out_end_trans; 1879 goto out_end_trans;
1943 1880
1944 gfs2_trans_add_bh(dip->i_gl, dibh, 1); 1881 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
1882 /* On the last dealloc, make this a regular file in case we crash.
1883 (We don't want to free these blocks a second time.) */
1884 if (last_dealloc)
1885 dip->i_inode.i_mode = S_IFREG;
1945 gfs2_dinode_out(dip, dibh->b_data); 1886 gfs2_dinode_out(dip, dibh->b_data);
1946 brelse(dibh); 1887 brelse(dibh);
1947 1888
@@ -1975,29 +1916,67 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
1975{ 1916{
1976 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 1917 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1977 struct buffer_head *bh; 1918 struct buffer_head *bh;
1978 int error; 1919 struct gfs2_leaf *leaf;
1920 u32 hsize, len;
1921 u32 ht_offset, lp_offset, ht_offset_cur = -1;
1922 u32 index = 0, next_index;
1923 __be64 *lp;
1924 u64 leaf_no;
1925 int error = 0, last;
1979 1926
1980 /* Dealloc on-disk leaves to FREEMETA state */ 1927 hsize = 1 << dip->i_depth;
1981 error = foreach_leaf(dip, leaf_dealloc, NULL); 1928 if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
1982 if (error) 1929 gfs2_consist_inode(dip);
1983 return error; 1930 return -EIO;
1931 }
1984 1932
1985 /* Make this a regular file in case we crash. 1933 lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
1986 (We don't want to free these blocks a second time.) */ 1934 if (!lp)
1935 return -ENOMEM;
1987 1936
1988 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 1937 while (index < hsize) {
1989 if (error) 1938 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1990 return error; 1939 ht_offset = index - lp_offset;
1991 1940
1992 error = gfs2_meta_inode_buffer(dip, &bh); 1941 if (ht_offset_cur != ht_offset) {
1993 if (!error) { 1942 error = gfs2_dir_read_data(dip, (char *)lp,
1994 gfs2_trans_add_bh(dip->i_gl, bh, 1); 1943 ht_offset * sizeof(__be64),
1995 ((struct gfs2_dinode *)bh->b_data)->di_mode = 1944 sdp->sd_hash_bsize, 1);
1996 cpu_to_be32(S_IFREG); 1945 if (error != sdp->sd_hash_bsize) {
1997 brelse(bh); 1946 if (error >= 0)
1947 error = -EIO;
1948 goto out;
1949 }
1950 ht_offset_cur = ht_offset;
1951 }
1952
1953 leaf_no = be64_to_cpu(lp[lp_offset]);
1954 if (leaf_no) {
1955 error = get_leaf(dip, leaf_no, &bh);
1956 if (error)
1957 goto out;
1958 leaf = (struct gfs2_leaf *)bh->b_data;
1959 len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth));
1960
1961 next_index = (index & ~(len - 1)) + len;
1962 last = ((next_index >= hsize) ? 1 : 0);
1963 error = leaf_dealloc(dip, index, len, leaf_no, bh,
1964 last);
1965 brelse(bh);
1966 if (error)
1967 goto out;
1968 index = next_index;
1969 } else
1970 index++;
1998 } 1971 }
1999 1972
2000 gfs2_trans_end(sdp); 1973 if (index != hsize) {
1974 gfs2_consist_inode(dip);
1975 error = -EIO;
1976 }
1977
1978out:
1979 kfree(lp);
2001 1980
2002 return error; 1981 return error;
2003} 1982}
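Both the removed foreach_leaf() and its inlined replacement in gfs2_dir_exhash_dealloc() walk the extendible-hash table by leaf span: a leaf at depth lf_depth is referenced by len = 1 << (i_depth - lf_depth) consecutive, aligned slots, so after handling a leaf the walk rounds the index down to that span and jumps past it; the inlined version also flags the final span with `last` so leaf_dealloc() can switch the dinode to S_IFREG exactly once. The skip arithmetic in isolation, runnable:

#include <stdio.h>

int main(void)
{
	unsigned dip_depth = 4;		/* table has 1 << 4 = 16 slots */
	unsigned lf_depth = 2;		/* this leaf split to depth 2 */
	unsigned index = 5;		/* current hash slot */

	unsigned len = 1u << (dip_depth - lf_depth);	/* leaf owns 4 slots */
	unsigned next = (index & ~(len - 1)) + len;	/* (5 & ~3) + 4 = 8 */

	printf("len=%u index=%u -> next=%u\n", len, index, next);
	return 0;
}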
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index a98f644bd3df..e686af11becd 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -22,8 +22,8 @@ extern struct inode *gfs2_dir_search(struct inode *dir,
22extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, 22extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
23 const struct gfs2_inode *ip); 23 const struct gfs2_inode *ip);
24extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 24extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
25 const struct gfs2_inode *ip, unsigned int type); 25 const struct gfs2_inode *ip);
26extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); 26extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
27extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, 27extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
28 filldir_t filldir); 28 filldir_t filldir);
29extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, 29extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index b5a5e60df0d5..fe9945f2ff72 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -139,7 +139,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
139 struct gfs2_sbd *sdp = sb->s_fs_info; 139 struct gfs2_sbd *sdp = sb->s_fs_info;
140 struct inode *inode; 140 struct inode *inode;
141 141
142 inode = gfs2_ilookup(sb, inum->no_addr); 142 inode = gfs2_ilookup(sb, inum->no_addr, 0);
143 if (inode) { 143 if (inode) {
144 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { 144 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
145 iput(inode); 145 iput(inode);
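gfs2_ilookup() grows a third argument in this series; from the call sites it is a non-blocking flag: 0 on this NFS export path, 1 from delete_work_func() below, where sleeping on an inode that is concurrently being evicted could stall the glock workqueue. A hedged sketch (not the real gfs2 code) of such a wrapper over the VFS inode-cache lookups:

#include <linux/fs.h>

/* example_* names are illustrative. ilookup5() waits for I_NEW or
 * I_FREEING inodes to settle; ilookup5_nowait() returns NULL instead
 * of waiting, the safe choice from a work item racing with eviction. */
static int example_test(struct inode *inode, void *opaque)
{
	u64 *no_addr = opaque;

	/* the real test compares the filesystem's own block address */
	return inode->i_ino == *no_addr;
}

static struct inode *example_ilookup(struct super_block *sb, u64 no_addr,
				     int non_block)
{
	if (non_block)
		return ilookup5_nowait(sb, (unsigned long)no_addr,
				       example_test, &no_addr);
	return ilookup5(sb, (unsigned long)no_addr, example_test, &no_addr);
}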
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index e48310885c48..a9f5cbe45cd9 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -545,18 +545,10 @@ static int gfs2_close(struct inode *inode, struct file *file)
545/** 545/**
546 * gfs2_fsync - sync the dirty data for a file (across the cluster) 546 * gfs2_fsync - sync the dirty data for a file (across the cluster)
547 * @file: the file that points to the dentry (we ignore this) 547 * @file: the file that points to the dentry (we ignore this)
548 * @dentry: the dentry that points to the inode to sync 548 * @datasync: set if we can ignore timestamp changes
549 * 549 *
550 * The VFS will flush "normal" data for us. We only need to worry 550 * The VFS will flush data for us. We only need to worry
551 * about metadata here. For journaled data, we just do a log flush 551 * about metadata here.
552 * as we can't avoid it. Otherwise we can just bale out if datasync
553 * is set. For stuffed inodes we must flush the log in order to
554 * ensure that all data is on disk.
555 *
556 * The call to write_inode_now() is there to write back metadata and
557 * the inode itself. It also tries to write the data, but that's
558 * (hopefully) a no-op due to the VFS having already called filemap_fdatawrite()
559 * for us.
560 * 552 *
561 * Returns: errno 553 * Returns: errno
562 */ 554 */
@@ -565,22 +557,20 @@ static int gfs2_fsync(struct file *file, int datasync)
565{ 557{
566 struct inode *inode = file->f_mapping->host; 558 struct inode *inode = file->f_mapping->host;
567 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); 559 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
568 int ret = 0; 560 struct gfs2_inode *ip = GFS2_I(inode);
569 561 int ret;
570 if (gfs2_is_jdata(GFS2_I(inode))) {
571 gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
572 return 0;
573 }
574 562
575 if (sync_state != 0) { 563 if (datasync)
576 if (!datasync) 564 sync_state &= ~I_DIRTY_SYNC;
577 ret = write_inode_now(inode, 0);
578 565
579 if (gfs2_is_stuffed(GFS2_I(inode))) 566 if (sync_state) {
580 gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl); 567 ret = sync_inode_metadata(inode, 1);
568 if (ret)
569 return ret;
570 gfs2_ail_flush(ip->i_gl);
581 } 571 }
582 572
583 return ret; 573 return 0;
584} 574}
585 575
586/** 576/**
@@ -826,6 +816,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
826 loff_t bytes, max_bytes; 816 loff_t bytes, max_bytes;
827 struct gfs2_alloc *al; 817 struct gfs2_alloc *al;
828 int error; 818 int error;
819 loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
829 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; 820 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
830 next = (next + 1) << sdp->sd_sb.sb_bsize_shift; 821 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
831 822
@@ -833,13 +824,15 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
833 if (mode & ~FALLOC_FL_KEEP_SIZE) 824 if (mode & ~FALLOC_FL_KEEP_SIZE)
834 return -EOPNOTSUPP; 825 return -EOPNOTSUPP;
835 826
836 offset = (offset >> sdp->sd_sb.sb_bsize_shift) << 827 offset &= bsize_mask;
837 sdp->sd_sb.sb_bsize_shift;
838 828
839 len = next - offset; 829 len = next - offset;
840 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; 830 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
841 if (!bytes) 831 if (!bytes)
842 bytes = UINT_MAX; 832 bytes = UINT_MAX;
833 bytes &= bsize_mask;
834 if (bytes == 0)
835 bytes = sdp->sd_sb.sb_bsize;
843 836
844 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); 837 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
845 error = gfs2_glock_nq(&ip->i_gh); 838 error = gfs2_glock_nq(&ip->i_gh);
@@ -870,6 +863,9 @@ retry:
870 if (error) { 863 if (error) {
871 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { 864 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
872 bytes >>= 1; 865 bytes >>= 1;
866 bytes &= bsize_mask;
867 if (bytes == 0)
868 bytes = sdp->sd_sb.sb_bsize;
873 goto retry; 869 goto retry;
874 } 870 }
875 goto out_qunlock; 871 goto out_qunlock;
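The fallocate changes replace shift-down rounding with a precomputed mask and close a subtle hole: after `bytes &= bsize_mask`, or after halving on ENOSPC, the per-iteration chunk can reach zero, so both paths bump it back to one filesystem block to guarantee forward progress. The arithmetic in isolation, runnable (loff_t modelled as int64_t):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t bsize = 4096;			/* sb_bsize, a power of two */
	int64_t bsize_mask = ~((int64_t)bsize - 1);

	int64_t offset = 10000;
	offset &= bsize_mask;			/* round down: 8192 */

	int64_t bytes = 6000;
	bytes &= bsize_mask;			/* align: 4096 */
	if (bytes == 0)
		bytes = bsize;			/* never degrade to zero */

	printf("offset=%lld bytes=%lld\n", (long long)offset, (long long)bytes);
	return 0;
}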
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 7a4fb630a320..2792a790e50b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -143,14 +143,9 @@ static int demote_ok(const struct gfs2_glock *gl)
143{ 143{
144 const struct gfs2_glock_operations *glops = gl->gl_ops; 144 const struct gfs2_glock_operations *glops = gl->gl_ops;
145 145
146 /* assert_spin_locked(&gl->gl_spin); */
147
148 if (gl->gl_state == LM_ST_UNLOCKED) 146 if (gl->gl_state == LM_ST_UNLOCKED)
149 return 0; 147 return 0;
150 if (test_bit(GLF_LFLUSH, &gl->gl_flags)) 148 if (!list_empty(&gl->gl_holders))
151 return 0;
152 if ((gl->gl_name.ln_type != LM_TYPE_INODE) &&
153 !list_empty(&gl->gl_holders))
154 return 0; 149 return 0;
155 if (glops->go_demote_ok) 150 if (glops->go_demote_ok)
156 return glops->go_demote_ok(gl); 151 return glops->go_demote_ok(gl);
@@ -158,6 +153,31 @@ static int demote_ok(const struct gfs2_glock *gl)
158} 153}
159 154
160 155
156void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
157{
158 spin_lock(&lru_lock);
159
160 if (!list_empty(&gl->gl_lru))
161 list_del_init(&gl->gl_lru);
162 else
163 atomic_inc(&lru_count);
164
165 list_add_tail(&gl->gl_lru, &lru_list);
166 set_bit(GLF_LRU, &gl->gl_flags);
167 spin_unlock(&lru_lock);
168}
169
170static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
171{
172 spin_lock(&lru_lock);
173 if (!list_empty(&gl->gl_lru)) {
174 list_del_init(&gl->gl_lru);
175 atomic_dec(&lru_count);
176 clear_bit(GLF_LRU, &gl->gl_flags);
177 }
178 spin_unlock(&lru_lock);
179}
180
161/** 181/**
162 * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list 182 * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
163 * @gl: the glock 183 * @gl: the glock
@@ -168,24 +188,8 @@ static int demote_ok(const struct gfs2_glock *gl)
168 188
169static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) 189static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
170{ 190{
171 if (demote_ok(gl)) { 191 if (demote_ok(gl))
172 spin_lock(&lru_lock); 192 gfs2_glock_add_to_lru(gl);
173
174 if (!list_empty(&gl->gl_lru))
175 list_del_init(&gl->gl_lru);
176 else
177 atomic_inc(&lru_count);
178
179 list_add_tail(&gl->gl_lru, &lru_list);
180 spin_unlock(&lru_lock);
181 }
182}
183
184void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
185{
186 spin_lock(&gl->gl_spin);
187 __gfs2_glock_schedule_for_reclaim(gl);
188 spin_unlock(&gl->gl_spin);
189} 193}
190 194
191/** 195/**
@@ -217,12 +221,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
217 spin_lock_bucket(gl->gl_hash); 221 spin_lock_bucket(gl->gl_hash);
218 hlist_bl_del_rcu(&gl->gl_list); 222 hlist_bl_del_rcu(&gl->gl_list);
219 spin_unlock_bucket(gl->gl_hash); 223 spin_unlock_bucket(gl->gl_hash);
220 spin_lock(&lru_lock); 224 gfs2_glock_remove_from_lru(gl);
221 if (!list_empty(&gl->gl_lru)) {
222 list_del_init(&gl->gl_lru);
223 atomic_dec(&lru_count);
224 }
225 spin_unlock(&lru_lock);
226 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); 225 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
227 GLOCK_BUG_ON(gl, mapping && mapping->nrpages); 226 GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
228 trace_gfs2_glock_put(gl); 227 trace_gfs2_glock_put(gl);
@@ -542,11 +541,6 @@ __acquires(&gl->gl_spin)
542 clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); 541 clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
543 542
544 gfs2_glock_hold(gl); 543 gfs2_glock_hold(gl);
545 if (target != LM_ST_UNLOCKED && (gl->gl_state == LM_ST_SHARED ||
546 gl->gl_state == LM_ST_DEFERRED) &&
547 !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
548 lck_flags |= LM_FLAG_TRY_1CB;
549
550 if (sdp->sd_lockstruct.ls_ops->lm_lock) { 544 if (sdp->sd_lockstruct.ls_ops->lm_lock) {
551 /* lock_dlm */ 545 /* lock_dlm */
552 ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); 546 ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
@@ -648,7 +642,7 @@ static void delete_work_func(struct work_struct *work)
648 /* Note: Unsafe to dereference ip as we don't hold right refs/locks */ 642 /* Note: Unsafe to dereference ip as we don't hold right refs/locks */
649 643
650 if (ip) 644 if (ip)
651 inode = gfs2_ilookup(sdp->sd_vfs, no_addr); 645 inode = gfs2_ilookup(sdp->sd_vfs, no_addr, 1);
652 else 646 else
653 inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); 647 inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
654 if (inode && !IS_ERR(inode)) { 648 if (inode && !IS_ERR(inode)) {
@@ -1025,6 +1019,9 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
1025 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 1019 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
1026 return -EIO; 1020 return -EIO;
1027 1021
1022 if (test_bit(GLF_LRU, &gl->gl_flags))
1023 gfs2_glock_remove_from_lru(gl);
1024
1028 spin_lock(&gl->gl_spin); 1025 spin_lock(&gl->gl_spin);
1029 add_to_queue(gh); 1026 add_to_queue(gh);
1030 if ((LM_FLAG_NOEXP & gh->gh_flags) && 1027 if ((LM_FLAG_NOEXP & gh->gh_flags) &&
@@ -1082,7 +1079,8 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1082 !test_bit(GLF_DEMOTE, &gl->gl_flags)) 1079 !test_bit(GLF_DEMOTE, &gl->gl_flags))
1083 fast_path = 1; 1080 fast_path = 1;
1084 } 1081 }
1085 __gfs2_glock_schedule_for_reclaim(gl); 1082 if (!test_bit(GLF_LFLUSH, &gl->gl_flags))
1083 __gfs2_glock_schedule_for_reclaim(gl);
1086 trace_gfs2_glock_queue(gh, 0); 1084 trace_gfs2_glock_queue(gh, 0);
1087 spin_unlock(&gl->gl_spin); 1085 spin_unlock(&gl->gl_spin);
1088 if (likely(fast_path)) 1086 if (likely(fast_path))
@@ -1348,11 +1346,14 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1348} 1346}
1349 1347
1350 1348
1351static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 1349static int gfs2_shrink_glock_memory(struct shrinker *shrink,
1350 struct shrink_control *sc)
1352{ 1351{
1353 struct gfs2_glock *gl; 1352 struct gfs2_glock *gl;
1354 int may_demote; 1353 int may_demote;
1355 int nr_skipped = 0; 1354 int nr_skipped = 0;
1355 int nr = sc->nr_to_scan;
1356 gfp_t gfp_mask = sc->gfp_mask;
1356 LIST_HEAD(skipped); 1357 LIST_HEAD(skipped);
1357 1358
1358 if (nr == 0) 1359 if (nr == 0)
@@ -1365,6 +1366,7 @@ static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_m
1365 while(nr && !list_empty(&lru_list)) { 1366 while(nr && !list_empty(&lru_list)) {
1366 gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); 1367 gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
1367 list_del_init(&gl->gl_lru); 1368 list_del_init(&gl->gl_lru);
1369 clear_bit(GLF_LRU, &gl->gl_flags);
1368 atomic_dec(&lru_count); 1370 atomic_dec(&lru_count);
1369 1371
1370 /* Test for being demotable */ 1372 /* Test for being demotable */
@@ -1387,6 +1389,7 @@ static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_m
1387 } 1389 }
1388 nr_skipped++; 1390 nr_skipped++;
1389 list_add(&gl->gl_lru, &skipped); 1391 list_add(&gl->gl_lru, &skipped);
1392 set_bit(GLF_LRU, &gl->gl_flags);
1390 } 1393 }
1391 list_splice(&skipped, &lru_list); 1394 list_splice(&skipped, &lru_list);
1392 atomic_add(nr_skipped, &lru_count); 1395 atomic_add(nr_skipped, &lru_count);
@@ -1459,12 +1462,7 @@ static void thaw_glock(struct gfs2_glock *gl)
1459 1462
1460static void clear_glock(struct gfs2_glock *gl) 1463static void clear_glock(struct gfs2_glock *gl)
1461{ 1464{
1462 spin_lock(&lru_lock); 1465 gfs2_glock_remove_from_lru(gl);
1463 if (!list_empty(&gl->gl_lru)) {
1464 list_del_init(&gl->gl_lru);
1465 atomic_dec(&lru_count);
1466 }
1467 spin_unlock(&lru_lock);
1468 1466
1469 spin_lock(&gl->gl_spin); 1467 spin_lock(&gl->gl_spin);
1470 if (gl->gl_state != LM_ST_UNLOCKED) 1468 if (gl->gl_state != LM_ST_UNLOCKED)
@@ -1599,9 +1597,11 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1599 return 0; 1597 return 0;
1600} 1598}
1601 1599
1602static const char *gflags2str(char *buf, const unsigned long *gflags) 1600static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
1603{ 1601{
1602 const unsigned long *gflags = &gl->gl_flags;
1604 char *p = buf; 1603 char *p = buf;
1604
1605 if (test_bit(GLF_LOCK, gflags)) 1605 if (test_bit(GLF_LOCK, gflags))
1606 *p++ = 'l'; 1606 *p++ = 'l';
1607 if (test_bit(GLF_DEMOTE, gflags)) 1607 if (test_bit(GLF_DEMOTE, gflags))
@@ -1624,6 +1624,10 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
1624 *p++ = 'F'; 1624 *p++ = 'F';
1625 if (test_bit(GLF_QUEUED, gflags)) 1625 if (test_bit(GLF_QUEUED, gflags))
1626 *p++ = 'q'; 1626 *p++ = 'q';
1627 if (test_bit(GLF_LRU, gflags))
1628 *p++ = 'L';
1629 if (gl->gl_object)
1630 *p++ = 'o';
1627 *p = 0; 1631 *p = 0;
1628 return buf; 1632 return buf;
1629} 1633}
@@ -1658,14 +1662,15 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1658 dtime *= 1000000/HZ; /* demote time in uSec */ 1662 dtime *= 1000000/HZ; /* demote time in uSec */
1659 if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) 1663 if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
1660 dtime = 0; 1664 dtime = 0;
1661 gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d r:%d\n", 1665 gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d\n",
1662 state2str(gl->gl_state), 1666 state2str(gl->gl_state),
1663 gl->gl_name.ln_type, 1667 gl->gl_name.ln_type,
1664 (unsigned long long)gl->gl_name.ln_number, 1668 (unsigned long long)gl->gl_name.ln_number,
1665 gflags2str(gflags_buf, &gl->gl_flags), 1669 gflags2str(gflags_buf, gl),
1666 state2str(gl->gl_target), 1670 state2str(gl->gl_target),
1667 state2str(gl->gl_demote_state), dtime, 1671 state2str(gl->gl_demote_state), dtime,
1668 atomic_read(&gl->gl_ail_count), 1672 atomic_read(&gl->gl_ail_count),
1673 atomic_read(&gl->gl_revokes),
1669 atomic_read(&gl->gl_ref)); 1674 atomic_read(&gl->gl_ref));
1670 1675
1671 list_for_each_entry(gh, &gl->gl_holders, gh_list) { 1676 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
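The new gfs2_shrink_glock_memory() signature follows a kernel-wide conversion: shrinker callbacks now take a struct shrink_control bundling nr_to_scan and gfp_mask rather than receiving them as separate arguments, with nr_to_scan == 0 meaning "just report how many objects are reclaimable". A hedged sketch of a shrinker against this interface as it stood in this era (the API changed again in later kernels); the example_* helpers are stubs:

#include <linux/mm.h>

static int example_count_reclaimable(void) { return 0; }	/* stub */
static int example_free_up_to(int nr) { return 0; }		/* stub */

static int example_shrink(struct shrinker *shrink,
			  struct shrink_control *sc)
{
	if (sc->nr_to_scan == 0)
		return example_count_reclaimable();

	if (!(sc->gfp_mask & __GFP_FS))
		return -1;	/* cannot recurse into the filesystem */

	return example_free_up_to(sc->nr_to_scan);
}

static struct shrinker example_shrinker = {
	.shrink	= example_shrink,
	.seeks	= DEFAULT_SEEKS,
};
/* register_shrinker(&example_shrinker) at init,
 * unregister_shrinker(&example_shrinker) at teardown. */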
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index aea160690e94..6b2f757b9281 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -225,11 +225,10 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
225 225
226extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); 226extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
227extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret); 227extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
228extern void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
229extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); 228extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
230extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip); 229extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
231extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); 230extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
232extern void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); 231extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl);
233extern void gfs2_glock_free(struct gfs2_glock *gl); 232extern void gfs2_glock_free(struct gfs2_glock *gl);
234 233
235extern int __init gfs2_glock_init(void); 234extern int __init gfs2_glock_init(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 25eeb2bcee47..8ef70f464731 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -28,33 +28,18 @@
28#include "trans.h" 28#include "trans.h"
29 29
30/** 30/**
31 * ail_empty_gl - remove all buffers for a given lock from the AIL 31 * __gfs2_ail_flush - remove all buffers for a given lock from the AIL
32 * @gl: the glock 32 * @gl: the glock
33 * 33 *
34 * None of the buffers should be dirty, locked, or pinned. 34 * None of the buffers should be dirty, locked, or pinned.
35 */ 35 */
36 36
37static void gfs2_ail_empty_gl(struct gfs2_glock *gl) 37static void __gfs2_ail_flush(struct gfs2_glock *gl)
38{ 38{
39 struct gfs2_sbd *sdp = gl->gl_sbd; 39 struct gfs2_sbd *sdp = gl->gl_sbd;
40 struct list_head *head = &gl->gl_ail_list; 40 struct list_head *head = &gl->gl_ail_list;
41 struct gfs2_bufdata *bd; 41 struct gfs2_bufdata *bd;
42 struct buffer_head *bh; 42 struct buffer_head *bh;
43 struct gfs2_trans tr;
44
45 memset(&tr, 0, sizeof(tr));
46 tr.tr_revokes = atomic_read(&gl->gl_ail_count);
47
48 if (!tr.tr_revokes)
49 return;
50
51 /* A shortened, inline version of gfs2_trans_begin() */
52 tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
53 tr.tr_ip = (unsigned long)__builtin_return_address(0);
54 INIT_LIST_HEAD(&tr.tr_list_buf);
55 gfs2_log_reserve(sdp, tr.tr_reserved);
56 BUG_ON(current->journal_info);
57 current->journal_info = &tr;
58 43
59 spin_lock(&sdp->sd_ail_lock); 44 spin_lock(&sdp->sd_ail_lock);
60 while (!list_empty(head)) { 45 while (!list_empty(head)) {
@@ -76,7 +61,47 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
76 } 61 }
77 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); 62 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
78 spin_unlock(&sdp->sd_ail_lock); 63 spin_unlock(&sdp->sd_ail_lock);
64}
65
66
67static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
68{
69 struct gfs2_sbd *sdp = gl->gl_sbd;
70 struct gfs2_trans tr;
71
72 memset(&tr, 0, sizeof(tr));
73 tr.tr_revokes = atomic_read(&gl->gl_ail_count);
74
75 if (!tr.tr_revokes)
76 return;
77
78 /* A shortened, inline version of gfs2_trans_begin() */
79 tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
80 tr.tr_ip = (unsigned long)__builtin_return_address(0);
81 INIT_LIST_HEAD(&tr.tr_list_buf);
82 gfs2_log_reserve(sdp, tr.tr_reserved);
83 BUG_ON(current->journal_info);
84 current->journal_info = &tr;
85
86 __gfs2_ail_flush(gl);
87
88 gfs2_trans_end(sdp);
89 gfs2_log_flush(sdp, NULL);
90}
91
92void gfs2_ail_flush(struct gfs2_glock *gl)
93{
94 struct gfs2_sbd *sdp = gl->gl_sbd;
95 unsigned int revokes = atomic_read(&gl->gl_ail_count);
96 int ret;
97
98 if (!revokes)
99 return;
79 100
101 ret = gfs2_trans_begin(sdp, 0, revokes);
102 if (ret)
103 return;
104 __gfs2_ail_flush(gl);
80 gfs2_trans_end(sdp); 105 gfs2_trans_end(sdp);
81 gfs2_log_flush(sdp, NULL); 106 gfs2_log_flush(sdp, NULL);
82} 107}
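
A rough userspace sketch of the refactoring pattern this hunk applies: the AIL
walk is hoisted into one shared helper and two thin wrappers that differ only
in how they set up their transaction context. Everything below is a stand-in,
not GFS2 API; it compiles with any C99 compiler.

#include <stdio.h>

struct ctx { int items; };

static void flush_core(struct ctx *c)            /* __gfs2_ail_flush analogue */
{
	while (c->items > 0) {
		printf("flushing item %d\n", c->items);
		c->items--;
	}
}

static void flush_inline_resv(struct ctx *c)     /* gfs2_ail_empty_gl analogue */
{
	if (c->items == 0)
		return;                          /* nothing queued: skip setup */
	printf("begin hand-rolled transaction\n");
	flush_core(c);
	printf("end transaction, flush log\n");
}

static void flush_plain(struct ctx *c)           /* gfs2_ail_flush analogue */
{
	if (c->items == 0)
		return;
	printf("begin regular transaction\n");
	flush_core(c);
	printf("end transaction, flush log\n");
}

int main(void)
{
	struct ctx a = { .items = 2 }, b = { .items = 1 };
	flush_inline_resv(&a);
	flush_plain(&b);
	return 0;
}
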
@@ -227,6 +252,119 @@ static int inode_go_demote_ok(const struct gfs2_glock *gl)
227} 252}
228 253
229/** 254/**
255 * gfs2_set_nlink - Set the inode's link count based on on-disk info
256 * @inode: The inode in question
257 * @nlink: The link count
258 *
259 * If the link count has hit zero, it must never be raised, whatever the
260 * on-disk inode might say. When new struct inodes are created the link
261 * count is set to 1, so that we can safely use this test even when reading
262 * in on-disk information for the first time.
263 */
264
265static void gfs2_set_nlink(struct inode *inode, u32 nlink)
266{
267 /*
268 * We will need to review setting the nlink count here in the
269 * light of the forthcoming ro bind mount work. This is a reminder
270 * to do that.
271 */
272 if ((inode->i_nlink != nlink) && (inode->i_nlink != 0)) {
273 if (nlink == 0)
274 clear_nlink(inode);
275 else
276 inode->i_nlink = nlink;
277 }
278}
279
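
The comment above documents a one-way invariant: once the in-core link count
reaches zero it must never be raised, even if the on-disk dinode still claims
otherwise. A minimal, self-contained C sketch of just that guard
(set_nlink_guarded() is a hypothetical stand-in, not the kernel helper):

#include <assert.h>
#include <stdint.h>

static void set_nlink_guarded(uint32_t *i_nlink, uint32_t disk_nlink)
{
	/* Update only while the inode is still "live" (nlink != 0). */
	if (*i_nlink != disk_nlink && *i_nlink != 0)
		*i_nlink = disk_nlink;
}

int main(void)
{
	uint32_t nlink = 1;             /* new in-core inodes start at 1 */
	set_nlink_guarded(&nlink, 3);   /* first read from disk: accept it */
	assert(nlink == 3);
	nlink = 0;                      /* the inode has been unlinked */
	set_nlink_guarded(&nlink, 3);   /* stale on-disk value is ignored */
	assert(nlink == 0);
	return 0;
}
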
280static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
281{
282 const struct gfs2_dinode *str = buf;
283 struct timespec atime;
284 u16 height, depth;
285
286 if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
287 goto corrupt;
288 ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino);
289 ip->i_inode.i_mode = be32_to_cpu(str->di_mode);
290 ip->i_inode.i_rdev = 0;
291 switch (ip->i_inode.i_mode & S_IFMT) {
292 case S_IFBLK:
293 case S_IFCHR:
294 ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major),
295 be32_to_cpu(str->di_minor));
296 break;
297 }
298
299 ip->i_inode.i_uid = be32_to_cpu(str->di_uid);
300 ip->i_inode.i_gid = be32_to_cpu(str->di_gid);
301 gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink));
302 i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
303 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
304 atime.tv_sec = be64_to_cpu(str->di_atime);
305 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
306 if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
307 ip->i_inode.i_atime = atime;
308 ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
309 ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
310 ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
311 ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
312
313 ip->i_goal = be64_to_cpu(str->di_goal_meta);
314 ip->i_generation = be64_to_cpu(str->di_generation);
315
316 ip->i_diskflags = be32_to_cpu(str->di_flags);
317 gfs2_set_inode_flags(&ip->i_inode);
318 height = be16_to_cpu(str->di_height);
319 if (unlikely(height > GFS2_MAX_META_HEIGHT))
320 goto corrupt;
321 ip->i_height = (u8)height;
322
323 depth = be16_to_cpu(str->di_depth);
324 if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
325 goto corrupt;
326 ip->i_depth = (u8)depth;
327 ip->i_entries = be32_to_cpu(str->di_entries);
328
329 ip->i_eattr = be64_to_cpu(str->di_eattr);
330 if (S_ISREG(ip->i_inode.i_mode))
331 gfs2_set_aops(&ip->i_inode);
332
333 return 0;
334corrupt:
335 gfs2_consist_inode(ip);
336 return -EIO;
337}
338
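
gfs2_dinode_in() above is the standard on-disk decode pattern: every field of
the big-endian dinode is converted with be*_to_cpu() before use, and fields
that could corrupt in-core state (height, depth) are range-checked first. A
compilable userspace approximation, using glibc's endian.h and a simplified
two-field struct; the limit of 10 is a placeholder for GFS2_MAX_META_HEIGHT:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct disk_ino { uint64_t size_be; uint16_t height_be; };

static int dinode_in(const struct disk_ino *d, uint64_t *size, uint8_t *height)
{
	uint16_t h = be16toh(d->height_be);
	if (h > 10)                     /* range-check before committing */
		return -1;              /* caller treats this as corruption */
	*size = be64toh(d->size_be);
	*height = (uint8_t)h;
	return 0;
}

int main(void)
{
	struct disk_ino d = { .size_be = htobe64(4096),
			      .height_be = htobe16(2) };
	uint64_t size;
	uint8_t height;

	if (dinode_in(&d, &size, &height) == 0)
		printf("size=%llu height=%u\n",
		       (unsigned long long)size, height);
	return 0;
}
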
339/**
340 * gfs2_inode_refresh - Refresh the incore copy of the dinode
341 * @ip: The GFS2 inode
342 *
343 * Returns: errno
344 */
345
346int gfs2_inode_refresh(struct gfs2_inode *ip)
347{
348 struct buffer_head *dibh;
349 int error;
350
351 error = gfs2_meta_inode_buffer(ip, &dibh);
352 if (error)
353 return error;
354
355 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) {
356 brelse(dibh);
357 return -EIO;
358 }
359
360 error = gfs2_dinode_in(ip, dibh->b_data);
361 brelse(dibh);
362 clear_bit(GIF_INVALID, &ip->i_flags);
363
364 return error;
365}
366
367/**
230 * inode_go_lock - operation done after an inode lock is locked by a process 368 * inode_go_lock - operation done after an inode lock is locked by a process
231 * @gl: the glock 369 * @gl: the glock
232 * @flags: 370 * @flags:
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index b3aa2e3210fd..6fce409b5a50 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -23,4 +23,6 @@ extern const struct gfs2_glock_operations gfs2_quota_glops;
23extern const struct gfs2_glock_operations gfs2_journal_glops; 23extern const struct gfs2_glock_operations gfs2_journal_glops;
24extern const struct gfs2_glock_operations *gfs2_glops_list[]; 24extern const struct gfs2_glock_operations *gfs2_glops_list[];
25 25
26extern void gfs2_ail_flush(struct gfs2_glock *gl);
27
26#endif /* __GLOPS_DOT_H__ */ 28#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 870a89d6d4dc..0a064e91ac70 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -20,7 +20,6 @@
20 20
21#define DIO_WAIT 0x00000010 21#define DIO_WAIT 0x00000010
22#define DIO_METADATA 0x00000020 22#define DIO_METADATA 0x00000020
23#define DIO_ALL 0x00000100
24 23
25struct gfs2_log_operations; 24struct gfs2_log_operations;
26struct gfs2_log_element; 25struct gfs2_log_element;
@@ -200,6 +199,8 @@ enum {
200 GLF_INITIAL = 10, 199 GLF_INITIAL = 10,
201 GLF_FROZEN = 11, 200 GLF_FROZEN = 11,
202 GLF_QUEUED = 12, 201 GLF_QUEUED = 12,
202 GLF_LRU = 13,
203 GLF_OBJECT = 14, /* Used only for tracing */
203}; 204};
204 205
205struct gfs2_glock { 206struct gfs2_glock {
@@ -234,6 +235,7 @@ struct gfs2_glock {
234 235
235 struct list_head gl_ail_list; 236 struct list_head gl_ail_list;
236 atomic_t gl_ail_count; 237 atomic_t gl_ail_count;
238 atomic_t gl_revokes;
237 struct delayed_work gl_work; 239 struct delayed_work gl_work;
238 struct work_struct gl_delete; 240 struct work_struct gl_delete;
239 struct rcu_head gl_rcu; 241 struct rcu_head gl_rcu;
@@ -374,8 +376,6 @@ struct gfs2_ail {
374 unsigned int ai_first; 376 unsigned int ai_first;
375 struct list_head ai_ail1_list; 377 struct list_head ai_ail1_list;
376 struct list_head ai_ail2_list; 378 struct list_head ai_ail2_list;
377
378 u64 ai_sync_gen;
379}; 379};
380 380
381struct gfs2_journal_extent { 381struct gfs2_journal_extent {
@@ -488,7 +488,6 @@ struct gfs2_sb_host {
488 488
489 char sb_lockproto[GFS2_LOCKNAME_LEN]; 489 char sb_lockproto[GFS2_LOCKNAME_LEN];
490 char sb_locktable[GFS2_LOCKNAME_LEN]; 490 char sb_locktable[GFS2_LOCKNAME_LEN];
491 u8 sb_uuid[16];
492}; 491};
493 492
494/* 493/*
@@ -654,7 +653,6 @@ struct gfs2_sbd {
654 spinlock_t sd_ail_lock; 653 spinlock_t sd_ail_lock;
655 struct list_head sd_ail1_list; 654 struct list_head sd_ail1_list;
656 struct list_head sd_ail2_list; 655 struct list_head sd_ail2_list;
657 u64 sd_ail_sync_gen;
658 656
659 /* Replay stuff */ 657 /* Replay stuff */
660 658
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9134dcb89479..03e0c529063e 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1,23 +1,25 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/sched.h>
11#include <linux/slab.h> 10#include <linux/slab.h>
12#include <linux/spinlock.h> 11#include <linux/spinlock.h>
13#include <linux/completion.h> 12#include <linux/completion.h>
14#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/namei.h>
15#include <linux/mm.h>
16#include <linux/xattr.h>
15#include <linux/posix_acl.h> 17#include <linux/posix_acl.h>
16#include <linux/sort.h>
17#include <linux/gfs2_ondisk.h> 18#include <linux/gfs2_ondisk.h>
18#include <linux/crc32.h> 19#include <linux/crc32.h>
20#include <linux/fiemap.h>
19#include <linux/security.h> 21#include <linux/security.h>
20#include <linux/time.h> 22#include <asm/uaccess.h>
21 23
22#include "gfs2.h" 24#include "gfs2.h"
23#include "incore.h" 25#include "incore.h"
@@ -26,19 +28,14 @@
26#include "dir.h" 28#include "dir.h"
27#include "xattr.h" 29#include "xattr.h"
28#include "glock.h" 30#include "glock.h"
29#include "glops.h"
30#include "inode.h" 31#include "inode.h"
31#include "log.h"
32#include "meta_io.h" 32#include "meta_io.h"
33#include "quota.h" 33#include "quota.h"
34#include "rgrp.h" 34#include "rgrp.h"
35#include "trans.h" 35#include "trans.h"
36#include "util.h" 36#include "util.h"
37 37#include "super.h"
38struct gfs2_inum_range_host { 38#include "glops.h"
39 u64 ir_start;
40 u64 ir_length;
41};
42 39
43struct gfs2_skip_data { 40struct gfs2_skip_data {
44 u64 no_addr; 41 u64 no_addr;
@@ -74,14 +71,14 @@ static int iget_set(struct inode *inode, void *opaque)
74 return 0; 71 return 0;
75} 72}
76 73
77struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr) 74struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int non_block)
78{ 75{
79 unsigned long hash = (unsigned long)no_addr; 76 unsigned long hash = (unsigned long)no_addr;
80 struct gfs2_skip_data data; 77 struct gfs2_skip_data data;
81 78
82 data.no_addr = no_addr; 79 data.no_addr = no_addr;
83 data.skipped = 0; 80 data.skipped = 0;
84 data.non_block = 0; 81 data.non_block = non_block;
85 return ilookup5(sb, hash, iget_test, &data); 82 return ilookup5(sb, hash, iget_test, &data);
86} 83}
87 84
@@ -248,203 +245,6 @@ fail_iput:
248 goto fail; 245 goto fail;
249} 246}
250 247
251static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
252{
253 const struct gfs2_dinode *str = buf;
254 struct timespec atime;
255 u16 height, depth;
256
257 if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
258 goto corrupt;
259 ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino);
260 ip->i_inode.i_mode = be32_to_cpu(str->di_mode);
261 ip->i_inode.i_rdev = 0;
262 switch (ip->i_inode.i_mode & S_IFMT) {
263 case S_IFBLK:
264 case S_IFCHR:
265 ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major),
266 be32_to_cpu(str->di_minor));
267 break;
268 };
269
270 ip->i_inode.i_uid = be32_to_cpu(str->di_uid);
271 ip->i_inode.i_gid = be32_to_cpu(str->di_gid);
272 /*
273 * We will need to review setting the nlink count here in the
274 * light of the forthcoming ro bind mount work. This is a reminder
275 * to do that.
276 */
277 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
278 i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
279 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
280 atime.tv_sec = be64_to_cpu(str->di_atime);
281 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
282 if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
283 ip->i_inode.i_atime = atime;
284 ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
285 ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
286 ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
287 ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
288
289 ip->i_goal = be64_to_cpu(str->di_goal_meta);
290 ip->i_generation = be64_to_cpu(str->di_generation);
291
292 ip->i_diskflags = be32_to_cpu(str->di_flags);
293 gfs2_set_inode_flags(&ip->i_inode);
294 height = be16_to_cpu(str->di_height);
295 if (unlikely(height > GFS2_MAX_META_HEIGHT))
296 goto corrupt;
297 ip->i_height = (u8)height;
298
299 depth = be16_to_cpu(str->di_depth);
300 if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
301 goto corrupt;
302 ip->i_depth = (u8)depth;
303 ip->i_entries = be32_to_cpu(str->di_entries);
304
305 ip->i_eattr = be64_to_cpu(str->di_eattr);
306 if (S_ISREG(ip->i_inode.i_mode))
307 gfs2_set_aops(&ip->i_inode);
308
309 return 0;
310corrupt:
311 if (gfs2_consist_inode(ip))
312 gfs2_dinode_print(ip);
313 return -EIO;
314}
315
316/**
317 * gfs2_inode_refresh - Refresh the incore copy of the dinode
318 * @ip: The GFS2 inode
319 *
320 * Returns: errno
321 */
322
323int gfs2_inode_refresh(struct gfs2_inode *ip)
324{
325 struct buffer_head *dibh;
326 int error;
327
328 error = gfs2_meta_inode_buffer(ip, &dibh);
329 if (error)
330 return error;
331
332 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) {
333 brelse(dibh);
334 return -EIO;
335 }
336
337 error = gfs2_dinode_in(ip, dibh->b_data);
338 brelse(dibh);
339 clear_bit(GIF_INVALID, &ip->i_flags);
340
341 return error;
342}
343
344int gfs2_dinode_dealloc(struct gfs2_inode *ip)
345{
346 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
347 struct gfs2_alloc *al;
348 struct gfs2_rgrpd *rgd;
349 int error;
350
351 if (gfs2_get_inode_blocks(&ip->i_inode) != 1) {
352 if (gfs2_consist_inode(ip))
353 gfs2_dinode_print(ip);
354 return -EIO;
355 }
356
357 al = gfs2_alloc_get(ip);
358 if (!al)
359 return -ENOMEM;
360
361 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
362 if (error)
363 goto out;
364
365 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
366 if (error)
367 goto out_qs;
368
369 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
370 if (!rgd) {
371 gfs2_consist_inode(ip);
372 error = -EIO;
373 goto out_rindex_relse;
374 }
375
376 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
377 &al->al_rgd_gh);
378 if (error)
379 goto out_rindex_relse;
380
381 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 1);
382 if (error)
383 goto out_rg_gunlock;
384
385 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
386 set_bit(GLF_LFLUSH, &ip->i_gl->gl_flags);
387
388 gfs2_free_di(rgd, ip);
389
390 gfs2_trans_end(sdp);
391
392out_rg_gunlock:
393 gfs2_glock_dq_uninit(&al->al_rgd_gh);
394out_rindex_relse:
395 gfs2_glock_dq_uninit(&al->al_ri_gh);
396out_qs:
397 gfs2_quota_unhold(ip);
398out:
399 gfs2_alloc_put(ip);
400 return error;
401}
402
403/**
404 * gfs2_change_nlink - Change nlink count on inode
405 * @ip: The GFS2 inode
406 * @diff: The change in the nlink count required
407 *
408 * Returns: errno
409 */
410int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
411{
412 struct buffer_head *dibh;
413 u32 nlink;
414 int error;
415
416 BUG_ON(diff != 1 && diff != -1);
417 nlink = ip->i_inode.i_nlink + diff;
418
419 /* If we are reducing the nlink count, but the new value ends up being
420 bigger than the old one, we must have underflowed. */
421 if (diff < 0 && nlink > ip->i_inode.i_nlink) {
422 if (gfs2_consist_inode(ip))
423 gfs2_dinode_print(ip);
424 return -EIO;
425 }
426
427 error = gfs2_meta_inode_buffer(ip, &dibh);
428 if (error)
429 return error;
430
431 if (diff > 0)
432 inc_nlink(&ip->i_inode);
433 else
434 drop_nlink(&ip->i_inode);
435
436 ip->i_inode.i_ctime = CURRENT_TIME;
437
438 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
439 gfs2_dinode_out(ip, dibh->b_data);
440 brelse(dibh);
441 mark_inode_dirty(&ip->i_inode);
442
443 if (ip->i_inode.i_nlink == 0)
444 gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */
445
446 return error;
447}
448 248
449struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) 249struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
450{ 250{
@@ -543,7 +343,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
543 343
544 /* Don't create entries in an unlinked directory */ 344 /* Don't create entries in an unlinked directory */
545 if (!dip->i_inode.i_nlink) 345 if (!dip->i_inode.i_nlink)
546 return -EPERM; 346 return -ENOENT;
547 347
548 error = gfs2_dir_check(&dip->i_inode, name, NULL); 348 error = gfs2_dir_check(&dip->i_inode, name, NULL);
549 switch (error) { 349 switch (error) {
@@ -613,21 +413,44 @@ out:
613 return error; 413 return error;
614} 414}
615 415
416static void gfs2_init_dir(struct buffer_head *dibh,
417 const struct gfs2_inode *parent)
418{
419 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
420 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
421
422 gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
423 dent->de_inum = di->di_num; /* already GFS2 endian */
424 dent->de_type = cpu_to_be16(DT_DIR);
425
426 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
427 gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
428 gfs2_inum_out(parent, dent);
429 dent->de_type = cpu_to_be16(DT_DIR);
430
431}
432
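
gfs2_init_dir() above packs "." directly after the dinode header and gives
".." all remaining space in the block, so an empty directory needs no further
allocation. The offset arithmetic, as a small C sketch; the header and entry
sizes below are made-up placeholders, not the real GFS2 constants:

#include <stdio.h>

int main(void)
{
	unsigned bsize = 4096;          /* filesystem block size */
	unsigned dinode = 232;          /* hypothetical dinode header size */
	unsigned dot = 24;              /* hypothetical "." dirent size */
	unsigned dotdot_len = bsize - dinode - dot; /* ".." takes the rest */

	printf("\".\" at offset %u, \"..\" at offset %u, rec_len %u\n",
	       dinode, dinode + dot, dotdot_len);
	return 0;
}
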
616/** 433/**
617 * init_dinode - Fill in a new dinode structure 434 * init_dinode - Fill in a new dinode structure
618 * @dip: the directory this inode is being created in 435 * @dip: The directory this inode is being created in
619 * @gl: The glock covering the new inode 436 * @gl: The glock covering the new inode
620 * @inum: the inode number 437 * @inum: The inode number
621 * @mode: the file permissions 438 * @mode: The file permissions
622 * @uid: 439 * @uid: The uid of the new inode
623 * @gid: 440 * @gid: The gid of the new inode
441 * @generation: The generation number of the new inode
442 * @dev: The device number (if a device node)
443 * @symname: The symlink destination (if a symlink)
444 * @size: The inode size (ignored for directories)
445 * @bhp: The buffer head (returned to caller)
624 * 446 *
625 */ 447 */
626 448
627static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, 449static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
628 const struct gfs2_inum_host *inum, unsigned int mode, 450 const struct gfs2_inum_host *inum, unsigned int mode,
629 unsigned int uid, unsigned int gid, 451 unsigned int uid, unsigned int gid,
630 const u64 *generation, dev_t dev, struct buffer_head **bhp) 452 const u64 *generation, dev_t dev, const char *symname,
453 unsigned size, struct buffer_head **bhp)
631{ 454{
632 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 455 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
633 struct gfs2_dinode *di; 456 struct gfs2_dinode *di;
@@ -646,7 +469,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
646 di->di_uid = cpu_to_be32(uid); 469 di->di_uid = cpu_to_be32(uid);
647 di->di_gid = cpu_to_be32(gid); 470 di->di_gid = cpu_to_be32(gid);
648 di->di_nlink = 0; 471 di->di_nlink = 0;
649 di->di_size = 0; 472 di->di_size = cpu_to_be64(size);
650 di->di_blocks = cpu_to_be64(1); 473 di->di_blocks = cpu_to_be64(1);
651 di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec); 474 di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec);
652 di->di_major = cpu_to_be32(MAJOR(dev)); 475 di->di_major = cpu_to_be32(MAJOR(dev));
@@ -654,16 +477,6 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
654 di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr); 477 di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
655 di->di_generation = cpu_to_be64(*generation); 478 di->di_generation = cpu_to_be64(*generation);
656 di->di_flags = 0; 479 di->di_flags = 0;
657
658 if (S_ISREG(mode)) {
659 if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
660 gfs2_tune_get(sdp, gt_new_files_jdata))
661 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
662 } else if (S_ISDIR(mode)) {
663 di->di_flags |= cpu_to_be32(dip->i_diskflags &
664 GFS2_DIF_INHERIT_JDATA);
665 }
666
667 di->__pad1 = 0; 480 di->__pad1 = 0;
668 di->di_payload_format = cpu_to_be32(S_ISDIR(mode) ? GFS2_FORMAT_DE : 0); 481 di->di_payload_format = cpu_to_be32(S_ISDIR(mode) ? GFS2_FORMAT_DE : 0);
669 di->di_height = 0; 482 di->di_height = 0;
@@ -677,7 +490,26 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
677 di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec); 490 di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec);
678 di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec); 491 di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec);
679 memset(&di->di_reserved, 0, sizeof(di->di_reserved)); 492 memset(&di->di_reserved, 0, sizeof(di->di_reserved));
680 493
494 switch(mode & S_IFMT) {
495 case S_IFREG:
496 if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
497 gfs2_tune_get(sdp, gt_new_files_jdata))
498 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
499 break;
500 case S_IFDIR:
501 di->di_flags |= cpu_to_be32(dip->i_diskflags &
502 GFS2_DIF_INHERIT_JDATA);
503 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
504 di->di_size = cpu_to_be64(sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
505 di->di_entries = cpu_to_be32(2);
506 gfs2_init_dir(dibh, dip);
507 break;
508 case S_IFLNK:
509 memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, size);
510 break;
511 }
512
681 set_buffer_uptodate(dibh); 513 set_buffer_uptodate(dibh);
682 514
683 *bhp = dibh; 515 *bhp = dibh;
@@ -685,7 +517,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
685 517
686static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, 518static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
687 unsigned int mode, const struct gfs2_inum_host *inum, 519 unsigned int mode, const struct gfs2_inum_host *inum,
688 const u64 *generation, dev_t dev, struct buffer_head **bhp) 520 const u64 *generation, dev_t dev, const char *symname,
521 unsigned int size, struct buffer_head **bhp)
689{ 522{
690 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 523 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
691 unsigned int uid, gid; 524 unsigned int uid, gid;
@@ -707,7 +540,7 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
707 if (error) 540 if (error)
708 goto out_quota; 541 goto out_quota;
709 542
710 init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, bhp); 543 init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, symname, size, bhp);
711 gfs2_quota_change(dip, +1, uid, gid); 544 gfs2_quota_change(dip, +1, uid, gid);
712 gfs2_trans_end(sdp); 545 gfs2_trans_end(sdp);
713 546
@@ -761,14 +594,16 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
761 goto fail_quota_locks; 594 goto fail_quota_locks;
762 } 595 }
763 596
764 error = gfs2_dir_add(&dip->i_inode, name, ip, IF2DT(ip->i_inode.i_mode)); 597 error = gfs2_dir_add(&dip->i_inode, name, ip);
765 if (error) 598 if (error)
766 goto fail_end_trans; 599 goto fail_end_trans;
767 600
768 error = gfs2_meta_inode_buffer(ip, &dibh); 601 error = gfs2_meta_inode_buffer(ip, &dibh);
769 if (error) 602 if (error)
770 goto fail_end_trans; 603 goto fail_end_trans;
771 ip->i_inode.i_nlink = 1; 604 inc_nlink(&ip->i_inode);
605 if (S_ISDIR(ip->i_inode.i_mode))
606 inc_nlink(&ip->i_inode);
772 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 607 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
773 gfs2_dinode_out(ip, dibh->b_data); 608 gfs2_dinode_out(ip, dibh->b_data);
774 brelse(dibh); 609 brelse(dibh);
@@ -815,27 +650,25 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
815} 650}
816 651
817/** 652/**
818 * gfs2_createi - Create a new inode 653 * gfs2_create_inode - Create a new inode
819 * @ghs: An array of two holders 654 * @dir: The parent directory
820 * @name: The name of the new file 655 * @dentry: The new dentry
821 * @mode: the permissions on the new inode 656 * @mode: The permissions on the new inode
822 * 657 * @dev: For device nodes, this is the device number
823 * @ghs[0] is an initialized holder for the directory 658 * @symname: For symlinks, this is the link destination
824 * @ghs[1] is the holder for the inode lock 659 * @size: The initial size of the inode (ignored for directories)
825 * 660 *
826 * If the return value is not NULL, the glocks on both the directory and the new 661 * Returns: 0 on success, or error code
827 * file are held. A transaction has been started and an inplace reservation
828 * is held, as well.
829 *
830 * Returns: An inode
831 */ 662 */
832 663
833struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, 664static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
834 unsigned int mode, dev_t dev) 665 unsigned int mode, dev_t dev, const char *symname,
666 unsigned int size)
835{ 667{
668 const struct qstr *name = &dentry->d_name;
669 struct gfs2_holder ghs[2];
836 struct inode *inode = NULL; 670 struct inode *inode = NULL;
837 struct gfs2_inode *dip = ghs->gh_gl->gl_object; 671 struct gfs2_inode *dip = GFS2_I(dir);
838 struct inode *dir = &dip->i_inode;
839 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 672 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
840 struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 }; 673 struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
841 int error; 674 int error;
@@ -843,10 +676,9 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
843 struct buffer_head *bh = NULL; 676 struct buffer_head *bh = NULL;
844 677
845 if (!name->len || name->len > GFS2_FNAMESIZE) 678 if (!name->len || name->len > GFS2_FNAMESIZE)
846 return ERR_PTR(-ENAMETOOLONG); 679 return -ENAMETOOLONG;
847 680
848 gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs); 681 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
849 error = gfs2_glock_nq(ghs);
850 if (error) 682 if (error)
851 goto fail; 683 goto fail;
852 684
@@ -864,7 +696,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
864 if (error) 696 if (error)
865 goto fail_gunlock; 697 goto fail_gunlock;
866 698
867 error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, &bh); 699 error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, symname, size, &bh);
868 if (error) 700 if (error)
869 goto fail_gunlock2; 701 goto fail_gunlock2;
870 702
@@ -891,18 +723,852 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
891 723
892 if (bh) 724 if (bh)
893 brelse(bh); 725 brelse(bh);
894 return inode; 726
727 gfs2_trans_end(sdp);
728 if (dip->i_alloc->al_rgd)
729 gfs2_inplace_release(dip);
730 gfs2_quota_unlock(dip);
731 gfs2_alloc_put(dip);
732 gfs2_glock_dq_uninit_m(2, ghs);
733 mark_inode_dirty(inode);
734 d_instantiate(dentry, inode);
735 return 0;
895 736
896fail_gunlock2: 737fail_gunlock2:
897 gfs2_glock_dq_uninit(ghs + 1); 738 gfs2_glock_dq_uninit(ghs + 1);
898 if (inode && !IS_ERR(inode)) 739 if (inode && !IS_ERR(inode))
899 iput(inode); 740 iput(inode);
900fail_gunlock: 741fail_gunlock:
901 gfs2_glock_dq(ghs); 742 gfs2_glock_dq_uninit(ghs);
902fail: 743fail:
903 if (bh) 744 if (bh)
904 brelse(bh); 745 brelse(bh);
905 return ERR_PTR(error); 746 return error;
747}
748
749/**
750 * gfs2_create - Create a file
751 * @dir: The directory in which to create the file
752 * @dentry: The dentry of the new file
753 * @mode: The mode of the new file
754 *
755 * Returns: errno
756 */
757
758static int gfs2_create(struct inode *dir, struct dentry *dentry,
759 int mode, struct nameidata *nd)
760{
761 struct inode *inode;
762 int ret;
763
764 for (;;) {
765 ret = gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0);
766 if (ret != -EEXIST || (nd && (nd->flags & LOOKUP_EXCL)))
767 return ret;
768
769 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
770 if (inode) {
771 if (!IS_ERR(inode))
772 break;
773 return PTR_ERR(inode);
774 }
775 }
776
777 d_instantiate(dentry, inode);
778 return 0;
779}
780
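
gfs2_create() above loops because, on a cluster filesystem, the name can
appear (another node created it) or vanish again between the failed create
and the follow-up lookup. A toy single-process model of that retry protocol;
try_create() and try_lookup() are stand-ins that flip a shared flag, not
filesystem calls:

#include <errno.h>
#include <stdio.h>

static int try_create(int *exists)
{
	if (*exists)
		return -EEXIST;
	*exists = 1;
	return 0;
}

static int try_lookup(int *exists)
{
	return *exists ? 0 : -ENOENT;
}

int main(void)
{
	int exists = 1;                 /* name already created elsewhere */
	int ret;

	for (;;) {
		ret = try_create(&exists);
		if (ret != -EEXIST)
			break;          /* created it (or hit a hard error) */
		if (try_lookup(&exists) == 0) {
			ret = 0;        /* someone else's inode: reuse it */
			break;
		}
		/* name vanished between create and lookup: retry create */
	}
	printf("ret=%d\n", ret);
	return 0;
}
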
781/**
782 * gfs2_lookup - Look up a filename in a directory and return its inode
783 * @dir: The directory inode
784 * @dentry: The dentry of the new inode
785 * @nd: passed from Linux VFS, ignored by us
786 *
787 * Called by the VFS layer. Lock dir and call gfs2_lookupi()
788 *
789 * Returns: errno
790 */
791
792static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
793 struct nameidata *nd)
794{
795 struct inode *inode = NULL;
796
797 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
798 if (inode && IS_ERR(inode))
799 return ERR_CAST(inode);
800
801 if (inode) {
802 struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
803 struct gfs2_holder gh;
804 int error;
805 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
806 if (error) {
807 iput(inode);
808 return ERR_PTR(error);
809 }
810 gfs2_glock_dq_uninit(&gh);
811 return d_splice_alias(inode, dentry);
812 }
813 d_add(dentry, inode);
814
815 return NULL;
816}
817
818/**
819 * gfs2_link - Link to a file
820 * @old_dentry: The inode to link
821 * @dir: Add link to this directory
822 * @dentry: The name of the link
823 *
824 * Link the inode in "old_dentry" into the directory "dir" with the
825 * name in "dentry".
826 *
827 * Returns: errno
828 */
829
830static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
831 struct dentry *dentry)
832{
833 struct gfs2_inode *dip = GFS2_I(dir);
834 struct gfs2_sbd *sdp = GFS2_SB(dir);
835 struct inode *inode = old_dentry->d_inode;
836 struct gfs2_inode *ip = GFS2_I(inode);
837 struct gfs2_holder ghs[2];
838 struct buffer_head *dibh;
839 int alloc_required;
840 int error;
841
842 if (S_ISDIR(inode->i_mode))
843 return -EPERM;
844
845 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
846 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
847
848 error = gfs2_glock_nq(ghs); /* parent */
849 if (error)
850 goto out_parent;
851
852 error = gfs2_glock_nq(ghs + 1); /* child */
853 if (error)
854 goto out_child;
855
856 error = -ENOENT;
857 if (inode->i_nlink == 0)
858 goto out_gunlock;
859
860 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0);
861 if (error)
862 goto out_gunlock;
863
864 error = gfs2_dir_check(dir, &dentry->d_name, NULL);
865 switch (error) {
866 case -ENOENT:
867 break;
868 case 0:
869 error = -EEXIST;
870 default:
871 goto out_gunlock;
872 }
873
874 error = -EINVAL;
875 if (!dip->i_inode.i_nlink)
876 goto out_gunlock;
877 error = -EFBIG;
878 if (dip->i_entries == (u32)-1)
879 goto out_gunlock;
880 error = -EPERM;
881 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
882 goto out_gunlock;
883 error = -EINVAL;
884 if (!ip->i_inode.i_nlink)
885 goto out_gunlock;
886 error = -EMLINK;
887 if (ip->i_inode.i_nlink == (u32)-1)
888 goto out_gunlock;
889
890 alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
891 if (error < 0)
892 goto out_gunlock;
893 error = 0;
894
895 if (alloc_required) {
896 struct gfs2_alloc *al = gfs2_alloc_get(dip);
897 if (!al) {
898 error = -ENOMEM;
899 goto out_gunlock;
900 }
901
902 error = gfs2_quota_lock_check(dip);
903 if (error)
904 goto out_alloc;
905
906 al->al_requested = sdp->sd_max_dirres;
907
908 error = gfs2_inplace_reserve(dip);
909 if (error)
910 goto out_gunlock_q;
911
912 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
913 gfs2_rg_blocks(al) +
914 2 * RES_DINODE + RES_STATFS +
915 RES_QUOTA, 0);
916 if (error)
917 goto out_ipres;
918 } else {
919 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
920 if (error)
921 goto out_ipres;
922 }
923
924 error = gfs2_meta_inode_buffer(ip, &dibh);
925 if (error)
926 goto out_end_trans;
927
928 error = gfs2_dir_add(dir, &dentry->d_name, ip);
929 if (error)
930 goto out_brelse;
931
932 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
933 inc_nlink(&ip->i_inode);
934 ip->i_inode.i_ctime = CURRENT_TIME;
935 gfs2_dinode_out(ip, dibh->b_data);
936 mark_inode_dirty(&ip->i_inode);
937
938out_brelse:
939 brelse(dibh);
940out_end_trans:
941 gfs2_trans_end(sdp);
942out_ipres:
943 if (alloc_required)
944 gfs2_inplace_release(dip);
945out_gunlock_q:
946 if (alloc_required)
947 gfs2_quota_unlock(dip);
948out_alloc:
949 if (alloc_required)
950 gfs2_alloc_put(dip);
951out_gunlock:
952 gfs2_glock_dq(ghs + 1);
953out_child:
954 gfs2_glock_dq(ghs);
955out_parent:
956 gfs2_holder_uninit(ghs);
957 gfs2_holder_uninit(ghs + 1);
958 if (!error) {
959 ihold(inode);
960 d_instantiate(dentry, inode);
961 mark_inode_dirty(inode);
962 }
963 return error;
964}
965
966/*
 967 * gfs2_unlink_ok - check to see that an inode is still in a directory
968 * @dip: the directory
969 * @name: the name of the file
970 * @ip: the inode
971 *
972 * Assumes that the lock on (at least) @dip is held.
973 *
974 * Returns: 0 if the parent/child relationship is correct, errno if it isn't
975 */
976
977static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
978 const struct gfs2_inode *ip)
979{
980 int error;
981
982 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
983 return -EPERM;
984
985 if ((dip->i_inode.i_mode & S_ISVTX) &&
986 dip->i_inode.i_uid != current_fsuid() &&
987 ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
988 return -EPERM;
989
990 if (IS_APPEND(&dip->i_inode))
991 return -EPERM;
992
993 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
994 if (error)
995 return error;
996
997 error = gfs2_dir_check(&dip->i_inode, name, ip);
998 if (error)
999 return error;
1000
1001 return 0;
1002}
1003
1004/**
1005 * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it
1006 * @dip: The parent directory
1007 * @dentry: The dentry of the entry in the parent directory
1008 * @bh: The inode buffer for the inode to be removed
1009 * @inode: The inode to be removed
1010 *
1011 * Called with all the locks and in a transaction. This will only be
1012 * called for a directory after it has been checked to ensure it is empty.
1013 *
1014 * Returns: 0 on success, or an error
1015 */
1016
1017static int gfs2_unlink_inode(struct gfs2_inode *dip,
1018 const struct dentry *dentry,
1019 struct buffer_head *bh)
1020{
1021 struct inode *inode = dentry->d_inode;
1022 struct gfs2_inode *ip = GFS2_I(inode);
1023 int error;
1024
1025 error = gfs2_dir_del(dip, dentry);
1026 if (error)
1027 return error;
1028
1029 ip->i_entries = 0;
1030 inode->i_ctime = CURRENT_TIME;
1031 if (S_ISDIR(inode->i_mode))
1032 clear_nlink(inode);
1033 else
1034 drop_nlink(inode);
1035 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1036 gfs2_dinode_out(ip, bh->b_data);
1037 mark_inode_dirty(inode);
1038 if (inode->i_nlink == 0)
1039 gfs2_unlink_di(inode);
1040 return 0;
1041}
1042
1043
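
gfs2_unlink_inode() above drops the link count differently by type: an empty
directory goes straight to zero (taking its "." self-link with it), a regular
file loses one link, and a count of zero queues the inode for deallocation
via gfs2_unlink_di(). The accounting in isolation, as a hedged sketch with
stand-in names:

#include <stdio.h>

static void unlink_inode(unsigned *nlink, int is_dir)
{
	if (is_dir)
		*nlink = 0;             /* clear_nlink() analogue */
	else if (*nlink > 0)
		*nlink -= 1;            /* drop_nlink() analogue */
	if (*nlink == 0)
		printf("queue inode for deallocation\n");
}

int main(void)
{
	unsigned file_links = 2, dir_links = 2;

	unlink_inode(&file_links, 0);
	printf("file nlink now %u\n", file_links);
	unlink_inode(&dir_links, 1);
	printf("dir nlink now %u\n", dir_links);
	return 0;
}
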
1044/**
1045 * gfs2_unlink - Unlink an inode (this does rmdir as well)
1046 * @dir: The inode of the directory containing the inode to unlink
1047 * @dentry: The file itself
1048 *
1049 * This routine uses the type of the inode as a flag to figure out
1050 * whether this is an unlink or an rmdir.
1051 *
1052 * Returns: errno
1053 */
1054
1055static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
1056{
1057 struct gfs2_inode *dip = GFS2_I(dir);
1058 struct gfs2_sbd *sdp = GFS2_SB(dir);
1059 struct inode *inode = dentry->d_inode;
1060 struct gfs2_inode *ip = GFS2_I(inode);
1061 struct buffer_head *bh;
1062 struct gfs2_holder ghs[3];
1063 struct gfs2_rgrpd *rgd;
1064 struct gfs2_holder ri_gh;
1065 int error;
1066
1067 error = gfs2_rindex_hold(sdp, &ri_gh);
1068 if (error)
1069 return error;
1070
1071 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
1072 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
1073
1074 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
1075 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
1076
1077
1078 error = gfs2_glock_nq(ghs); /* parent */
1079 if (error)
1080 goto out_parent;
1081
1082 error = gfs2_glock_nq(ghs + 1); /* child */
1083 if (error)
1084 goto out_child;
1085
1086 error = -ENOENT;
1087 if (inode->i_nlink == 0)
1088 goto out_rgrp;
1089
1090 if (S_ISDIR(inode->i_mode)) {
1091 error = -ENOTEMPTY;
1092 if (ip->i_entries > 2 || inode->i_nlink > 2)
1093 goto out_rgrp;
1094 }
1095
1096 error = gfs2_glock_nq(ghs + 2); /* rgrp */
1097 if (error)
1098 goto out_rgrp;
1099
1100 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
1101 if (error)
1102 goto out_gunlock;
1103
1104 error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0);
1105 if (error)
1106 goto out_gunlock;
1107
1108 error = gfs2_meta_inode_buffer(ip, &bh);
1109 if (error)
1110 goto out_end_trans;
1111
1112 error = gfs2_unlink_inode(dip, dentry, bh);
1113 brelse(bh);
1114
1115out_end_trans:
1116 gfs2_trans_end(sdp);
1117out_gunlock:
1118 gfs2_glock_dq(ghs + 2);
1119out_rgrp:
1120 gfs2_holder_uninit(ghs + 2);
1121 gfs2_glock_dq(ghs + 1);
1122out_child:
1123 gfs2_holder_uninit(ghs + 1);
1124 gfs2_glock_dq(ghs);
1125out_parent:
1126 gfs2_holder_uninit(ghs);
1127 gfs2_glock_dq_uninit(&ri_gh);
1128 return error;
1129}
1130
1131/**
1132 * gfs2_symlink - Create a symlink
1133 * @dir: The directory to create the symlink in
1134 * @dentry: The dentry to put the symlink in
1135 * @symname: The thing which the link points to
1136 *
1137 * Returns: errno
1138 */
1139
1140static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
1141 const char *symname)
1142{
1143 struct gfs2_sbd *sdp = GFS2_SB(dir);
1144 unsigned int size;
1145
1146 size = strlen(symname);
1147 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
1148 return -ENAMETOOLONG;
1149
1150 return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size);
1151}
1152
1153/**
1154 * gfs2_mkdir - Make a directory
1155 * @dir: The parent directory of the new one
1156 * @dentry: The dentry of the new directory
1157 * @mode: The mode of the new directory
1158 *
1159 * Returns: errno
1160 */
1161
1162static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1163{
1164 return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0);
1165}
1166
1167/**
1168 * gfs2_mknod - Make a special file
1169 * @dir: The directory in which the special file will reside
1170 * @dentry: The dentry of the special file
1171 * @mode: The mode of the special file
1172 * @dev: The device specification of the special file
1173 *
1174 */
1175
1176static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
1177 dev_t dev)
1178{
1179 return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0);
1180}
1181
1182/*
1183 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
1184 * @this: move this
1185 * @to: to here
1186 *
1187 * Follow @to back to the root and make sure we don't encounter @this
1188 * Assumes we already hold the rename lock.
1189 *
1190 * Returns: errno
1191 */
1192
1193static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
1194{
1195 struct inode *dir = &to->i_inode;
1196 struct super_block *sb = dir->i_sb;
1197 struct inode *tmp;
1198 int error = 0;
1199
1200 igrab(dir);
1201
1202 for (;;) {
1203 if (dir == &this->i_inode) {
1204 error = -EINVAL;
1205 break;
1206 }
1207 if (dir == sb->s_root->d_inode) {
1208 error = 0;
1209 break;
1210 }
1211
1212 tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
1213 if (IS_ERR(tmp)) {
1214 error = PTR_ERR(tmp);
1215 break;
1216 }
1217
1218 iput(dir);
1219 dir = tmp;
1220 }
1221
1222 iput(dir);
1223
1224 return error;
1225}
1226
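
gfs2_ok_to_move() above prevents rename cycles by walking ".." from the
destination up to the root; meeting the directory being moved means the
rename would orphan a loop. A self-contained model using plain parent
pointers (the cluster-wide rename lock the original depends on is assumed
held, and the structs are stand-ins for inodes):

#include <stddef.h>
#include <stdio.h>

struct node { struct node *parent; };

static int ok_to_move(const struct node *this, const struct node *to,
		      const struct node *root)
{
	for (const struct node *d = to; d != NULL; d = d->parent) {
		if (d == this)
			return -1;      /* would create a cycle: -EINVAL */
		if (d == root)
			return 0;       /* reached the root: safe */
	}
	return -1;                      /* detached subtree: refuse */
}

int main(void)
{
	struct node root = { .parent = NULL };
	struct node a = { .parent = &root };
	struct node b = { .parent = &a };

	printf("move a under a/b: %s\n",
	       ok_to_move(&a, &b, &root) ? "refused" : "ok");
	printf("move b under /:   %s\n",
	       ok_to_move(&b, &root, &root) ? "refused" : "ok");
	return 0;
}
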
1227/**
1228 * gfs2_rename - Rename a file
1229 * @odir: Parent directory of old file name
1230 * @odentry: The old dentry of the file
1231 * @ndir: Parent directory of new file name
1232 * @ndentry: The new dentry of the file
1233 *
1234 * Returns: errno
1235 */
1236
1237static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1238 struct inode *ndir, struct dentry *ndentry)
1239{
1240 struct gfs2_inode *odip = GFS2_I(odir);
1241 struct gfs2_inode *ndip = GFS2_I(ndir);
1242 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
1243 struct gfs2_inode *nip = NULL;
1244 struct gfs2_sbd *sdp = GFS2_SB(odir);
1245 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
1246 struct gfs2_rgrpd *nrgd;
1247 unsigned int num_gh;
1248 int dir_rename = 0;
1249 int alloc_required = 0;
1250 unsigned int x;
1251 int error;
1252
1253 if (ndentry->d_inode) {
1254 nip = GFS2_I(ndentry->d_inode);
1255 if (ip == nip)
1256 return 0;
1257 }
1258
1259 error = gfs2_rindex_hold(sdp, &ri_gh);
1260 if (error)
1261 return error;
1262
1263 if (odip != ndip) {
1264 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
1265 0, &r_gh);
1266 if (error)
1267 goto out;
1268
1269 if (S_ISDIR(ip->i_inode.i_mode)) {
1270 dir_rename = 1;
1271 /* don't move a directory into its subdir */
1272 error = gfs2_ok_to_move(ip, ndip);
1273 if (error)
1274 goto out_gunlock_r;
1275 }
1276 }
1277
1278 num_gh = 1;
1279 gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
1280 if (odip != ndip) {
1281 gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
1282 num_gh++;
1283 }
1284 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
1285 num_gh++;
1286
1287 if (nip) {
1288 gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
1289 num_gh++;
1290 /* grab the resource lock for unlink flag twiddling
1291 * this is the case of the target file already existing
1292 * so we unlink before doing the rename
1293 */
1294 nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr);
1295 if (nrgd)
1296 gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
1297 }
1298
1299 for (x = 0; x < num_gh; x++) {
1300 error = gfs2_glock_nq(ghs + x);
1301 if (error)
1302 goto out_gunlock;
1303 }
1304
1305 error = -ENOENT;
1306 if (ip->i_inode.i_nlink == 0)
1307 goto out_gunlock;
1308
1309 /* Check out the old directory */
1310
1311 error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
1312 if (error)
1313 goto out_gunlock;
1314
1315 /* Check out the new directory */
1316
1317 if (nip) {
1318 error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
1319 if (error)
1320 goto out_gunlock;
1321
1322 if (nip->i_inode.i_nlink == 0) {
1323 error = -EAGAIN;
1324 goto out_gunlock;
1325 }
1326
1327 if (S_ISDIR(nip->i_inode.i_mode)) {
1328 if (nip->i_entries < 2) {
1329 gfs2_consist_inode(nip);
1330 error = -EIO;
1331 goto out_gunlock;
1332 }
1333 if (nip->i_entries > 2) {
1334 error = -ENOTEMPTY;
1335 goto out_gunlock;
1336 }
1337 }
1338 } else {
1339 error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0);
1340 if (error)
1341 goto out_gunlock;
1342
1343 error = gfs2_dir_check(ndir, &ndentry->d_name, NULL);
1344 switch (error) {
1345 case -ENOENT:
1346 error = 0;
1347 break;
1348 case 0:
1349 error = -EEXIST;
1350 default:
1351 goto out_gunlock;
1352 }
1353
1354 if (odip != ndip) {
1355 if (!ndip->i_inode.i_nlink) {
1356 error = -ENOENT;
1357 goto out_gunlock;
1358 }
1359 if (ndip->i_entries == (u32)-1) {
1360 error = -EFBIG;
1361 goto out_gunlock;
1362 }
1363 if (S_ISDIR(ip->i_inode.i_mode) &&
1364 ndip->i_inode.i_nlink == (u32)-1) {
1365 error = -EMLINK;
1366 goto out_gunlock;
1367 }
1368 }
1369 }
1370
1371 /* Check out the dir to be renamed */
1372
1373 if (dir_rename) {
1374 error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0);
1375 if (error)
1376 goto out_gunlock;
1377 }
1378
1379 if (nip == NULL)
1380 alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
1381 error = alloc_required;
1382 if (error < 0)
1383 goto out_gunlock;
1384 error = 0;
1385
1386 if (alloc_required) {
1387 struct gfs2_alloc *al = gfs2_alloc_get(ndip);
1388 if (!al) {
1389 error = -ENOMEM;
1390 goto out_gunlock;
1391 }
1392
1393 error = gfs2_quota_lock_check(ndip);
1394 if (error)
1395 goto out_alloc;
1396
1397 al->al_requested = sdp->sd_max_dirres;
1398
1399 error = gfs2_inplace_reserve_ri(ndip);
1400 if (error)
1401 goto out_gunlock_q;
1402
1403 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
1404 gfs2_rg_blocks(al) +
1405 4 * RES_DINODE + 4 * RES_LEAF +
1406 RES_STATFS + RES_QUOTA + 4, 0);
1407 if (error)
1408 goto out_ipreserv;
1409 } else {
1410 error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
1411 5 * RES_LEAF + 4, 0);
1412 if (error)
1413 goto out_gunlock;
1414 }
1415
1416 /* Remove the target file, if it exists */
1417
1418 if (nip) {
1419 struct buffer_head *bh;
1420 error = gfs2_meta_inode_buffer(nip, &bh);
1421 if (error)
1422 goto out_end_trans;
1423 error = gfs2_unlink_inode(ndip, ndentry, bh);
1424 brelse(bh);
1425 }
1426
1427 if (dir_rename) {
1428 error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
1429 if (error)
1430 goto out_end_trans;
1431 } else {
1432 struct buffer_head *dibh;
1433 error = gfs2_meta_inode_buffer(ip, &dibh);
1434 if (error)
1435 goto out_end_trans;
1436 ip->i_inode.i_ctime = CURRENT_TIME;
1437 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1438 gfs2_dinode_out(ip, dibh->b_data);
1439 brelse(dibh);
1440 }
1441
1442 error = gfs2_dir_del(odip, odentry);
1443 if (error)
1444 goto out_end_trans;
1445
1446 error = gfs2_dir_add(ndir, &ndentry->d_name, ip);
1447 if (error)
1448 goto out_end_trans;
1449
1450out_end_trans:
1451 gfs2_trans_end(sdp);
1452out_ipreserv:
1453 if (alloc_required)
1454 gfs2_inplace_release(ndip);
1455out_gunlock_q:
1456 if (alloc_required)
1457 gfs2_quota_unlock(ndip);
1458out_alloc:
1459 if (alloc_required)
1460 gfs2_alloc_put(ndip);
1461out_gunlock:
1462 while (x--) {
1463 gfs2_glock_dq(ghs + x);
1464 gfs2_holder_uninit(ghs + x);
1465 }
1466out_gunlock_r:
1467 if (r_gh.gh_gl)
1468 gfs2_glock_dq_uninit(&r_gh);
1469out:
1470 gfs2_glock_dq_uninit(&ri_gh);
1471 return error;
1472}
1473
1474/**
1475 * gfs2_follow_link - Follow a symbolic link
1476 * @dentry: The dentry of the link
1477 * @nd: Data that we pass to vfs_follow_link()
1478 *
1479 * This can handle symlinks of any valid size, since the target is stored inline.
1480 *
1481 * Returns: 0 on success or error code
1482 */
1483
1484static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1485{
1486 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
1487 struct gfs2_holder i_gh;
1488 struct buffer_head *dibh;
1489 unsigned int size;
1490 char *buf;
1491 int error;
1492
1493 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
1494 error = gfs2_glock_nq(&i_gh);
1495 if (error) {
1496 gfs2_holder_uninit(&i_gh);
1497 nd_set_link(nd, ERR_PTR(error));
1498 return NULL;
1499 }
1500
1501 size = (unsigned int)i_size_read(&ip->i_inode);
1502 if (size == 0) {
1503 gfs2_consist_inode(ip);
1504 buf = ERR_PTR(-EIO);
1505 goto out;
1506 }
1507
1508 error = gfs2_meta_inode_buffer(ip, &dibh);
1509 if (error) {
1510 buf = ERR_PTR(error);
1511 goto out;
1512 }
1513
1514 buf = kzalloc(size + 1, GFP_NOFS);
1515 if (!buf)
1516 buf = ERR_PTR(-ENOMEM);
1517 else
1518 memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size);
1519 brelse(dibh);
1520out:
1521 gfs2_glock_dq_uninit(&i_gh);
1522 nd_set_link(nd, buf);
1523 return NULL;
1524}
1525
1526static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1527{
1528 char *s = nd_get_link(nd);
1529 if (!IS_ERR(s))
1530 kfree(s);
1531}
1532
1533/**
1534 * gfs2_permission -
1535 * @inode: The inode
1536 * @mask: The mask to be tested
1537 * @flags: Indicates whether this is an RCU path walk or not
1538 *
1539 * This may be called from the VFS directly, or from within GFS2 with the
1540 * inode locked, so we look to see if the glock is already locked and only
1541 * lock the glock if its not already been done.
1542 *
1543 * Returns: errno
1544 */
1545
1546int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
1547{
1548 struct gfs2_inode *ip;
1549 struct gfs2_holder i_gh;
1550 int error;
1551 int unlock = 0;
1552
1553
1554 ip = GFS2_I(inode);
1555 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
1556 if (flags & IPERM_FLAG_RCU)
1557 return -ECHILD;
1558 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
1559 if (error)
1560 return error;
1561 unlock = 1;
1562 }
1563
1564 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
1565 error = -EACCES;
1566 else
1567 error = generic_permission(inode, mask, flags, gfs2_check_acl);
1568 if (unlock)
1569 gfs2_glock_dq_uninit(&i_gh);
1570
1571 return error;
906} 1572}
907 1573
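
gfs2_permission() above has to cope with being entered both from the VFS
(glock not held) and from inside GFS2 (glock already held by this task), and
must not sleep during an RCU path walk. The lock-if-not-mine shape, reduced
to a compilable sketch with trivial stand-in primitives:

#include <stdio.h>

struct lk { int held_by_me; };

static int check_perm(struct lk *l, int rcu_walk)
{
	int unlock = 0;

	if (!l->held_by_me) {
		if (rcu_walk)
			return -1;      /* -ECHILD: redo walk with refs */
		l->held_by_me = 1;      /* "acquire" the lock */
		unlock = 1;
	}
	/* ...the real permission check would run here... */
	if (unlock)
		l->held_by_me = 0;      /* drop only what we took */
	return 0;
}

int main(void)
{
	struct lk l = { 0 };

	printf("rcu walk, unlocked: %d\n", check_perm(&l, 1));
	printf("ref walk, unlocked: %d\n", check_perm(&l, 0));
	l.held_by_me = 1;
	printf("already held:       %d\n", check_perm(&l, 1));
	return 0;
}
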
908static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) 1574static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
@@ -928,8 +1594,6 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
928 * @ip: 1594 * @ip:
929 * @attr: 1595 * @attr:
930 * 1596 *
931 * Called with a reference on the vnode.
932 *
933 * Returns: errno 1597 * Returns: errno
934 */ 1598 */
935 1599
@@ -949,60 +1613,280 @@ int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
949 return error; 1613 return error;
950} 1614}
951 1615
952void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) 1616static int setattr_chown(struct inode *inode, struct iattr *attr)
953{ 1617{
954 struct gfs2_dinode *str = buf; 1618 struct gfs2_inode *ip = GFS2_I(inode);
955 1619 struct gfs2_sbd *sdp = GFS2_SB(inode);
956 str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); 1620 u32 ouid, ogid, nuid, ngid;
957 str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI); 1621 int error;
958 str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); 1622
959 str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); 1623 ouid = inode->i_uid;
960 str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); 1624 ogid = inode->i_gid;
961 str->di_mode = cpu_to_be32(ip->i_inode.i_mode); 1625 nuid = attr->ia_uid;
962 str->di_uid = cpu_to_be32(ip->i_inode.i_uid); 1626 ngid = attr->ia_gid;
963 str->di_gid = cpu_to_be32(ip->i_inode.i_gid); 1627
964 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); 1628 if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
965 str->di_size = cpu_to_be64(i_size_read(&ip->i_inode)); 1629 ouid = nuid = NO_QUOTA_CHANGE;
966 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 1630 if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
967 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); 1631 ogid = ngid = NO_QUOTA_CHANGE;
968 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); 1632
969 str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec); 1633 if (!gfs2_alloc_get(ip))
970 1634 return -ENOMEM;
971 str->di_goal_meta = cpu_to_be64(ip->i_goal); 1635
972 str->di_goal_data = cpu_to_be64(ip->i_goal); 1636 error = gfs2_quota_lock(ip, nuid, ngid);
973 str->di_generation = cpu_to_be64(ip->i_generation); 1637 if (error)
974 1638 goto out_alloc;
975 str->di_flags = cpu_to_be32(ip->i_diskflags); 1639
976 str->di_height = cpu_to_be16(ip->i_height); 1640 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
977 str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && 1641 error = gfs2_quota_check(ip, nuid, ngid);
978 !(ip->i_diskflags & GFS2_DIF_EXHASH) ? 1642 if (error)
979 GFS2_FORMAT_DE : 0); 1643 goto out_gunlock_q;
980 str->di_depth = cpu_to_be16(ip->i_depth); 1644 }
981 str->di_entries = cpu_to_be32(ip->i_entries); 1645
982 1646 error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
983 str->di_eattr = cpu_to_be64(ip->i_eattr); 1647 if (error)
984 str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); 1648 goto out_gunlock_q;
985 str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); 1649
986 str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); 1650 error = gfs2_setattr_simple(ip, attr);
987} 1651 if (error)
988 1652 goto out_end_trans;
989void gfs2_dinode_print(const struct gfs2_inode *ip) 1653
990{ 1654 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
991 printk(KERN_INFO " no_formal_ino = %llu\n", 1655 u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
992 (unsigned long long)ip->i_no_formal_ino); 1656 gfs2_quota_change(ip, -blocks, ouid, ogid);
993 printk(KERN_INFO " no_addr = %llu\n", 1657 gfs2_quota_change(ip, blocks, nuid, ngid);
994 (unsigned long long)ip->i_no_addr); 1658 }
995 printk(KERN_INFO " i_size = %llu\n", 1659
996 (unsigned long long)i_size_read(&ip->i_inode)); 1660out_end_trans:
997 printk(KERN_INFO " blocks = %llu\n", 1661 gfs2_trans_end(sdp);
998 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); 1662out_gunlock_q:
999 printk(KERN_INFO " i_goal = %llu\n", 1663 gfs2_quota_unlock(ip);
1000 (unsigned long long)ip->i_goal); 1664out_alloc:
1001 printk(KERN_INFO " i_diskflags = 0x%.8X\n", ip->i_diskflags); 1665 gfs2_alloc_put(ip);
1002 printk(KERN_INFO " i_height = %u\n", ip->i_height); 1666 return error;
1003 printk(KERN_INFO " i_depth = %u\n", ip->i_depth); 1667}
1004 printk(KERN_INFO " i_entries = %u\n", ip->i_entries); 1668
1005 printk(KERN_INFO " i_eattr = %llu\n", 1669/**
1006 (unsigned long long)ip->i_eattr); 1670 * gfs2_setattr - Change attributes on an inode
1671 * @dentry: The dentry which is changing
1672 * @attr: The structure describing the change
1673 *
1674 * The VFS layer wants to change one or more of an inode's attributes. Write
1675 * that change out to disk.
1676 *
1677 * Returns: errno
1678 */
1679
1680static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
1681{
1682 struct inode *inode = dentry->d_inode;
1683 struct gfs2_inode *ip = GFS2_I(inode);
1684 struct gfs2_holder i_gh;
1685 int error;
1686
1687 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1688 if (error)
1689 return error;
1690
1691 error = -EPERM;
1692 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
1693 goto out;
1694
1695 error = inode_change_ok(inode, attr);
1696 if (error)
1697 goto out;
1698
1699 if (attr->ia_valid & ATTR_SIZE)
1700 error = gfs2_setattr_size(inode, attr->ia_size);
1701 else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
1702 error = setattr_chown(inode, attr);
1703 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
1704 error = gfs2_acl_chmod(ip, attr);
1705 else
1706 error = gfs2_setattr_simple(ip, attr);
1707
1708out:
1709 gfs2_glock_dq_uninit(&i_gh);
1710 if (!error)
1711 mark_inode_dirty(inode);
1712 return error;
1713}
1714
1715/**
1716 * gfs2_getattr - Read out an inode's attributes
1717 * @mnt: The vfsmount the inode is being accessed from
1718 * @dentry: The dentry to stat
1719 * @stat: The inode's stats
1720 *
1721 * This may be called from the VFS directly, or from within GFS2 with the
1722 * inode locked, so we look to see if the glock is already locked and only
1723 * lock the glock if it has not already been locked. Note that it is the NFS
1724 * readdirplus operation which causes this to be called (from filldir)
1725 * with the glock already held.
1726 *
1727 * Returns: errno
1728 */
1729
1730static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
1731 struct kstat *stat)
1732{
1733 struct inode *inode = dentry->d_inode;
1734 struct gfs2_inode *ip = GFS2_I(inode);
1735 struct gfs2_holder gh;
1736 int error;
1737 int unlock = 0;
1738
1739 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
1740 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
1741 if (error)
1742 return error;
1743 unlock = 1;
1744 }
1745
1746 generic_fillattr(inode, stat);
1747 if (unlock)
1748 gfs2_glock_dq_uninit(&gh);
1749
1750 return 0;
1751}
1752
1753static int gfs2_setxattr(struct dentry *dentry, const char *name,
1754 const void *data, size_t size, int flags)
1755{
1756 struct inode *inode = dentry->d_inode;
1757 struct gfs2_inode *ip = GFS2_I(inode);
1758 struct gfs2_holder gh;
1759 int ret;
1760
1761 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1762 ret = gfs2_glock_nq(&gh);
1763 if (ret == 0) {
1764 ret = generic_setxattr(dentry, name, data, size, flags);
1765 gfs2_glock_dq(&gh);
1766 }
1767 gfs2_holder_uninit(&gh);
1768 return ret;
1769}
1770
1771static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
1772 void *data, size_t size)
1773{
1774 struct inode *inode = dentry->d_inode;
1775 struct gfs2_inode *ip = GFS2_I(inode);
1776 struct gfs2_holder gh;
1777 int ret;
1778
1779 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
1780 ret = gfs2_glock_nq(&gh);
1781 if (ret == 0) {
1782 ret = generic_getxattr(dentry, name, data, size);
1783 gfs2_glock_dq(&gh);
1784 }
1785 gfs2_holder_uninit(&gh);
1786 return ret;
1787}
1788
1789static int gfs2_removexattr(struct dentry *dentry, const char *name)
1790{
1791 struct inode *inode = dentry->d_inode;
1792 struct gfs2_inode *ip = GFS2_I(inode);
1793 struct gfs2_holder gh;
1794 int ret;
1795
1796 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1797 ret = gfs2_glock_nq(&gh);
1798 if (ret == 0) {
1799 ret = generic_removexattr(dentry, name);
1800 gfs2_glock_dq(&gh);
1801 }
1802 gfs2_holder_uninit(&gh);
1803 return ret;
1804}
1805
1806static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1807 u64 start, u64 len)
1808{
1809 struct gfs2_inode *ip = GFS2_I(inode);
1810 struct gfs2_holder gh;
1811 int ret;
1812
1813 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
1814 if (ret)
1815 return ret;
1816
1817 mutex_lock(&inode->i_mutex);
1818
1819 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
1820 if (ret)
1821 goto out;
1822
1823 if (gfs2_is_stuffed(ip)) {
1824 u64 phys = ip->i_no_addr << inode->i_blkbits;
1825 u64 size = i_size_read(inode);
1826 u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED|
1827 FIEMAP_EXTENT_DATA_INLINE;
1828 phys += sizeof(struct gfs2_dinode);
1829 phys += start;
1830 if (start + len > size)
1831 len = size - start;
1832 if (start < size)
1833 ret = fiemap_fill_next_extent(fieinfo, start, phys,
1834 len, flags);
1835 if (ret == 1)
1836 ret = 0;
1837 } else {
1838 ret = __generic_block_fiemap(inode, fieinfo, start, len,
1839 gfs2_block_map);
1840 }
1841
1842 gfs2_glock_dq_uninit(&gh);
1843out:
1844 mutex_unlock(&inode->i_mutex);
1845 return ret;
1007} 1846}
1008 1847
1848const struct inode_operations gfs2_file_iops = {
1849 .permission = gfs2_permission,
1850 .setattr = gfs2_setattr,
1851 .getattr = gfs2_getattr,
1852 .setxattr = gfs2_setxattr,
1853 .getxattr = gfs2_getxattr,
1854 .listxattr = gfs2_listxattr,
1855 .removexattr = gfs2_removexattr,
1856 .fiemap = gfs2_fiemap,
1857};
1858
1859const struct inode_operations gfs2_dir_iops = {
1860 .create = gfs2_create,
1861 .lookup = gfs2_lookup,
1862 .link = gfs2_link,
1863 .unlink = gfs2_unlink,
1864 .symlink = gfs2_symlink,
1865 .mkdir = gfs2_mkdir,
1866 .rmdir = gfs2_unlink,
1867 .mknod = gfs2_mknod,
1868 .rename = gfs2_rename,
1869 .permission = gfs2_permission,
1870 .setattr = gfs2_setattr,
1871 .getattr = gfs2_getattr,
1872 .setxattr = gfs2_setxattr,
1873 .getxattr = gfs2_getxattr,
1874 .listxattr = gfs2_listxattr,
1875 .removexattr = gfs2_removexattr,
1876 .fiemap = gfs2_fiemap,
1877};
1878
1879const struct inode_operations gfs2_symlink_iops = {
1880 .readlink = generic_readlink,
1881 .follow_link = gfs2_follow_link,
1882 .put_link = gfs2_put_link,
1883 .permission = gfs2_permission,
1884 .setattr = gfs2_setattr,
1885 .getattr = gfs2_getattr,
1886 .setxattr = gfs2_setxattr,
1887 .getxattr = gfs2_getxattr,
1888 .listxattr = gfs2_listxattr,
1889 .removexattr = gfs2_removexattr,
1890 .fiemap = gfs2_fiemap,
1891};
1892
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 099ca305e518..31606076f701 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -102,22 +102,16 @@ extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
102extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, 102extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
103 u64 *no_formal_ino, 103 u64 *no_formal_ino,
104 unsigned int blktype); 104 unsigned int blktype);
105extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 105extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int nonblock);
106 106
107extern int gfs2_inode_refresh(struct gfs2_inode *ip); 107extern int gfs2_inode_refresh(struct gfs2_inode *ip);
108 108
109extern int gfs2_dinode_dealloc(struct gfs2_inode *inode);
110extern int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
111extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, 109extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
112 int is_root); 110 int is_root);
113extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
114 const struct qstr *name,
115 unsigned int mode, dev_t dev);
116extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags); 111extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags);
117extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); 112extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
118extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); 113extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
119extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); 114extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
120extern void gfs2_dinode_print(const struct gfs2_inode *ip);
121 115
122extern const struct inode_operations gfs2_file_iops; 116extern const struct inode_operations gfs2_file_iops;
123extern const struct inode_operations gfs2_dir_iops; 117extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 5b102c1887fd..903115f2bb34 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
18#include <linux/kthread.h> 18#include <linux/kthread.h>
19#include <linux/freezer.h> 19#include <linux/freezer.h>
20#include <linux/bio.h> 20#include <linux/bio.h>
21#include <linux/writeback.h>
21 22
22#include "gfs2.h" 23#include "gfs2.h"
23#include "incore.h" 24#include "incore.h"
@@ -83,55 +84,97 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
83/** 84/**
84 * gfs2_ail1_start_one - Start I/O on a part of the AIL 85 * gfs2_ail1_start_one - Start I/O on a part of the AIL
85 * @sdp: the filesystem 86 * @sdp: the filesystem
86 * @tr: the part of the AIL 87 * @wbc: The writeback control structure
88 * @ai: The ail structure
87 * 89 *
88 */ 90 */
89 91
90static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) 92static int gfs2_ail1_start_one(struct gfs2_sbd *sdp,
93 struct writeback_control *wbc,
94 struct gfs2_ail *ai)
91__releases(&sdp->sd_ail_lock) 95__releases(&sdp->sd_ail_lock)
92__acquires(&sdp->sd_ail_lock) 96__acquires(&sdp->sd_ail_lock)
93{ 97{
98 struct gfs2_glock *gl = NULL;
99 struct address_space *mapping;
94 struct gfs2_bufdata *bd, *s; 100 struct gfs2_bufdata *bd, *s;
95 struct buffer_head *bh; 101 struct buffer_head *bh;
96 int retry;
97 102
98 do { 103 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, bd_ail_st_list) {
99 retry = 0; 104 bh = bd->bd_bh;
100 105
101 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, 106 gfs2_assert(sdp, bd->bd_ail == ai);
102 bd_ail_st_list) {
103 bh = bd->bd_bh;
104 107
105 gfs2_assert(sdp, bd->bd_ail == ai); 108 if (!buffer_busy(bh)) {
109 if (!buffer_uptodate(bh))
110 gfs2_io_error_bh(sdp, bh);
111 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
112 continue;
113 }
106 114
107 if (!buffer_busy(bh)) { 115 if (!buffer_dirty(bh))
108 if (!buffer_uptodate(bh)) 116 continue;
109 gfs2_io_error_bh(sdp, bh); 117 if (gl == bd->bd_gl)
110 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); 118 continue;
111 continue; 119 gl = bd->bd_gl;
112 } 120 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
121 mapping = bh->b_page->mapping;
122 if (!mapping)
123 continue;
124 spin_unlock(&sdp->sd_ail_lock);
125 generic_writepages(mapping, wbc);
126 spin_lock(&sdp->sd_ail_lock);
127 if (wbc->nr_to_write <= 0)
128 break;
129 return 1;
130 }
113 131
114 if (!buffer_dirty(bh)) 132 return 0;
115 continue; 133}
116 134
117 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
118 135
119 get_bh(bh); 136/**
120 spin_unlock(&sdp->sd_ail_lock); 137 * gfs2_ail1_flush - start writeback of some ail1 entries
121 lock_buffer(bh); 138 * @sdp: The super block
122 if (test_clear_buffer_dirty(bh)) { 139 * @wbc: The writeback control structure
123 bh->b_end_io = end_buffer_write_sync; 140 *
124 submit_bh(WRITE_SYNC, bh); 141 * Writes back some ail1 entries, according to the limits in the
125 } else { 142 * writeback control structure
126 unlock_buffer(bh); 143 */
127 brelse(bh); 144
128 } 145void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
129 spin_lock(&sdp->sd_ail_lock); 146{
130 147 struct list_head *head = &sdp->sd_ail1_list;
131 retry = 1; 148 struct gfs2_ail *ai;
149
150 trace_gfs2_ail_flush(sdp, wbc, 1);
151 spin_lock(&sdp->sd_ail_lock);
152restart:
153 list_for_each_entry_reverse(ai, head, ai_list) {
154 if (wbc->nr_to_write <= 0)
132 break; 155 break;
133 } 156 if (gfs2_ail1_start_one(sdp, wbc, ai))
134 } while (retry); 157 goto restart;
158 }
159 spin_unlock(&sdp->sd_ail_lock);
160 trace_gfs2_ail_flush(sdp, wbc, 0);
161}
162
163/**
164 * gfs2_ail1_start - start writeback of all ail1 entries
165 * @sdp: The superblock
166 */
167
168static void gfs2_ail1_start(struct gfs2_sbd *sdp)
169{
170 struct writeback_control wbc = {
171 .sync_mode = WB_SYNC_NONE,
172 .nr_to_write = LONG_MAX,
173 .range_start = 0,
174 .range_end = LLONG_MAX,
175 };
176
177 return gfs2_ail1_flush(sdp, &wbc);
135} 178}
136 179
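gfs2_ail1_flush() above stops once wbc->nr_to_write is exhausted and restarts its reverse walk whenever gfs2_ail1_start_one() reports progress by returning 1. The following is a rough userspace sketch of that budgeted, restartable scan; the structure, names, and numbers are invented purely for illustration and are not kernel code.

        #include <stdio.h>

        int main(void)
        {
                long nr_to_write = 5;           /* wbc-style budget */
                int dirty[4] = { 2, 1, 3, 1 };  /* dirty items per AIL entry */

        restart:
                for (int i = 3; i >= 0; i--) {  /* reverse scan, oldest first */
                        if (nr_to_write <= 0)
                                break;
                        if (dirty[i]) {
                                nr_to_write -= dirty[i];
                                dirty[i] = 0;
                                goto restart;   /* mirrors the == 1 return */
                        }
                }
                printf("budget left: %ld\n", nr_to_write);
                return 0;
        }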
137/** 180/**
@@ -141,7 +184,7 @@ __acquires(&sdp->sd_ail_lock)
141 * 184 *
142 */ 185 */
143 186
144static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags) 187static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
145{ 188{
146 struct gfs2_bufdata *bd, *s; 189 struct gfs2_bufdata *bd, *s;
147 struct buffer_head *bh; 190 struct buffer_head *bh;
@@ -149,76 +192,63 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
149 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, 192 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
150 bd_ail_st_list) { 193 bd_ail_st_list) {
151 bh = bd->bd_bh; 194 bh = bd->bd_bh;
152
153 gfs2_assert(sdp, bd->bd_ail == ai); 195 gfs2_assert(sdp, bd->bd_ail == ai);
154 196 if (buffer_busy(bh))
155 if (buffer_busy(bh)) { 197 continue;
156 if (flags & DIO_ALL)
157 continue;
158 else
159 break;
160 }
161
162 if (!buffer_uptodate(bh)) 198 if (!buffer_uptodate(bh))
163 gfs2_io_error_bh(sdp, bh); 199 gfs2_io_error_bh(sdp, bh);
164
165 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); 200 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
166 } 201 }
167 202
168 return list_empty(&ai->ai_ail1_list);
169} 203}
170 204
171static void gfs2_ail1_start(struct gfs2_sbd *sdp) 205/**
172{ 206 * gfs2_ail1_empty - Try to empty the ail1 lists
173 struct list_head *head; 207 * @sdp: The superblock
174 u64 sync_gen; 208 *
175 struct gfs2_ail *ai; 209 * Tries to empty the ail1 lists, starting with the oldest first
176 int done = 0; 210 */
177
178 spin_lock(&sdp->sd_ail_lock);
179 head = &sdp->sd_ail1_list;
180 if (list_empty(head)) {
181 spin_unlock(&sdp->sd_ail_lock);
182 return;
183 }
184 sync_gen = sdp->sd_ail_sync_gen++;
185
186 while(!done) {
187 done = 1;
188 list_for_each_entry_reverse(ai, head, ai_list) {
189 if (ai->ai_sync_gen >= sync_gen)
190 continue;
191 ai->ai_sync_gen = sync_gen;
192 gfs2_ail1_start_one(sdp, ai); /* This may drop ail lock */
193 done = 0;
194 break;
195 }
196 }
197
198 spin_unlock(&sdp->sd_ail_lock);
199}
200 211
201static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) 212static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
202{ 213{
203 struct gfs2_ail *ai, *s; 214 struct gfs2_ail *ai, *s;
204 int ret; 215 int ret;
205 216
206 spin_lock(&sdp->sd_ail_lock); 217 spin_lock(&sdp->sd_ail_lock);
207
208 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) { 218 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
209 if (gfs2_ail1_empty_one(sdp, ai, flags)) 219 gfs2_ail1_empty_one(sdp, ai);
220 if (list_empty(&ai->ai_ail1_list))
210 list_move(&ai->ai_list, &sdp->sd_ail2_list); 221 list_move(&ai->ai_list, &sdp->sd_ail2_list);
211 else if (!(flags & DIO_ALL)) 222 else
212 break; 223 break;
213 } 224 }
214
215 ret = list_empty(&sdp->sd_ail1_list); 225 ret = list_empty(&sdp->sd_ail1_list);
216
217 spin_unlock(&sdp->sd_ail_lock); 226 spin_unlock(&sdp->sd_ail_lock);
218 227
219 return ret; 228 return ret;
220} 229}
221 230
231static void gfs2_ail1_wait(struct gfs2_sbd *sdp)
232{
233 struct gfs2_ail *ai;
234 struct gfs2_bufdata *bd;
235 struct buffer_head *bh;
236
237 spin_lock(&sdp->sd_ail_lock);
238 list_for_each_entry_reverse(ai, &sdp->sd_ail1_list, ai_list) {
239 list_for_each_entry(bd, &ai->ai_ail1_list, bd_ail_st_list) {
240 bh = bd->bd_bh;
241 if (!buffer_locked(bh))
242 continue;
243 get_bh(bh);
244 spin_unlock(&sdp->sd_ail_lock);
245 wait_on_buffer(bh);
246 brelse(bh);
247 return;
248 }
249 }
250 spin_unlock(&sdp->sd_ail_lock);
251}
222 252
223/** 253/**
224 * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced 254 * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced
@@ -574,7 +604,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
574 set_buffer_uptodate(bh); 604 set_buffer_uptodate(bh);
575 clear_buffer_dirty(bh); 605 clear_buffer_dirty(bh);
576 606
577 gfs2_ail1_empty(sdp, 0); 607 gfs2_ail1_empty(sdp);
578 tail = current_tail(sdp); 608 tail = current_tail(sdp);
579 609
580 lh = (struct gfs2_log_header *)bh->b_data; 610 lh = (struct gfs2_log_header *)bh->b_data;
@@ -869,9 +899,9 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
869 gfs2_log_flush(sdp, NULL); 899 gfs2_log_flush(sdp, NULL);
870 for (;;) { 900 for (;;) {
871 gfs2_ail1_start(sdp); 901 gfs2_ail1_start(sdp);
872 if (gfs2_ail1_empty(sdp, DIO_ALL)) 902 gfs2_ail1_wait(sdp);
903 if (gfs2_ail1_empty(sdp))
873 break; 904 break;
874 msleep(10);
875 } 905 }
876} 906}
877 907
@@ -905,20 +935,20 @@ int gfs2_logd(void *data)
905 935
906 preflush = atomic_read(&sdp->sd_log_pinned); 936 preflush = atomic_read(&sdp->sd_log_pinned);
907 if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { 937 if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
908 gfs2_ail1_empty(sdp, DIO_ALL); 938 gfs2_ail1_empty(sdp);
909 gfs2_log_flush(sdp, NULL); 939 gfs2_log_flush(sdp, NULL);
910 gfs2_ail1_empty(sdp, DIO_ALL);
911 } 940 }
912 941
913 if (gfs2_ail_flush_reqd(sdp)) { 942 if (gfs2_ail_flush_reqd(sdp)) {
914 gfs2_ail1_start(sdp); 943 gfs2_ail1_start(sdp);
915 io_schedule(); 944 gfs2_ail1_wait(sdp);
916 gfs2_ail1_empty(sdp, 0); 945 gfs2_ail1_empty(sdp);
917 gfs2_log_flush(sdp, NULL); 946 gfs2_log_flush(sdp, NULL);
918 gfs2_ail1_empty(sdp, DIO_ALL);
919 } 947 }
920 948
921 wake_up(&sdp->sd_log_waitq); 949 if (!gfs2_ail_flush_reqd(sdp))
950 wake_up(&sdp->sd_log_waitq);
951
922 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; 952 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
923 if (freezing(current)) 953 if (freezing(current))
924 refrigerator(); 954 refrigerator();
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 0d007f920234..ab0621698b73 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -12,6 +12,7 @@
12 12
13#include <linux/list.h> 13#include <linux/list.h>
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/writeback.h>
15#include "incore.h" 16#include "incore.h"
16 17
17/** 18/**
@@ -59,6 +60,7 @@ extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
59extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); 60extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
60extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); 61extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
61extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); 62extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
63extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc);
62 64
63extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); 65extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
64extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp); 66extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 51d27f00ebb4..05bbb124699f 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -40,7 +40,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
40{ 40{
41 struct gfs2_bufdata *bd; 41 struct gfs2_bufdata *bd;
42 42
43 gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)); 43 BUG_ON(!current->journal_info);
44 44
45 clear_buffer_dirty(bh); 45 clear_buffer_dirty(bh);
46 if (test_set_buffer_pinned(bh)) 46 if (test_set_buffer_pinned(bh))
@@ -65,6 +65,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
65 * @sdp: the filesystem the buffer belongs to 65 * @sdp: the filesystem the buffer belongs to
66 * @bh: The buffer to unpin 66 * @bh: The buffer to unpin
67 * @ai: 67 * @ai:
68 * @flags: The inode dirty flags
68 * 69 *
69 */ 70 */
70 71
@@ -73,10 +74,8 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
73{ 74{
74 struct gfs2_bufdata *bd = bh->b_private; 75 struct gfs2_bufdata *bd = bh->b_private;
75 76
76 gfs2_assert_withdraw(sdp, buffer_uptodate(bh)); 77 BUG_ON(!buffer_uptodate(bh));
77 78 BUG_ON(!buffer_pinned(bh));
78 if (!buffer_pinned(bh))
79 gfs2_assert_withdraw(sdp, 0);
80 79
81 lock_buffer(bh); 80 lock_buffer(bh);
82 mark_buffer_dirty(bh); 81 mark_buffer_dirty(bh);
@@ -95,8 +94,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
95 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); 94 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
96 spin_unlock(&sdp->sd_ail_lock); 95 spin_unlock(&sdp->sd_ail_lock);
97 96
98 if (test_and_clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags)) 97 clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
99 gfs2_glock_schedule_for_reclaim(bd->bd_gl);
100 trace_gfs2_pin(bd, 0); 98 trace_gfs2_pin(bd, 0);
101 unlock_buffer(bh); 99 unlock_buffer(bh);
102 atomic_dec(&sdp->sd_log_pinned); 100 atomic_dec(&sdp->sd_log_pinned);
@@ -322,12 +320,16 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
322 320
323static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) 321static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
324{ 322{
323 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
324 struct gfs2_glock *gl = bd->bd_gl;
325 struct gfs2_trans *tr; 325 struct gfs2_trans *tr;
326 326
327 tr = current->journal_info; 327 tr = current->journal_info;
328 tr->tr_touched = 1; 328 tr->tr_touched = 1;
329 tr->tr_num_revoke++; 329 tr->tr_num_revoke++;
330 sdp->sd_log_num_revoke++; 330 sdp->sd_log_num_revoke++;
331 atomic_inc(&gl->gl_revokes);
332 set_bit(GLF_LFLUSH, &gl->gl_flags);
331 list_add(&le->le_list, &sdp->sd_log_le_revoke); 333 list_add(&le->le_list, &sdp->sd_log_le_revoke);
332} 334}
333 335
@@ -350,9 +352,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
350 ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke); 352 ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
351 offset = sizeof(struct gfs2_log_descriptor); 353 offset = sizeof(struct gfs2_log_descriptor);
352 354
353 while (!list_empty(head)) { 355 list_for_each_entry(bd, head, bd_le.le_list) {
354 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
355 list_del_init(&bd->bd_le.le_list);
356 sdp->sd_log_num_revoke--; 356 sdp->sd_log_num_revoke--;
357 357
358 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { 358 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
@@ -367,8 +367,6 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
367 } 367 }
368 368
369 *(__be64 *)(bh->b_data + offset) = cpu_to_be64(bd->bd_blkno); 369 *(__be64 *)(bh->b_data + offset) = cpu_to_be64(bd->bd_blkno);
370 kmem_cache_free(gfs2_bufdata_cachep, bd);
371
372 offset += sizeof(u64); 370 offset += sizeof(u64);
373 } 371 }
374 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 372 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
@@ -376,6 +374,22 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
376 submit_bh(WRITE_SYNC, bh); 374 submit_bh(WRITE_SYNC, bh);
377} 375}
378 376
377static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
378{
379 struct list_head *head = &sdp->sd_log_le_revoke;
380 struct gfs2_bufdata *bd;
381 struct gfs2_glock *gl;
382
383 while (!list_empty(head)) {
384 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
385 list_del_init(&bd->bd_le.le_list);
386 gl = bd->bd_gl;
387 atomic_dec(&gl->gl_revokes);
388 clear_bit(GLF_LFLUSH, &gl->gl_flags);
389 kmem_cache_free(gfs2_bufdata_cachep, bd);
390 }
391}
392
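The new per-glock gl_revokes counter and the GLF_LFLUSH flag now bracket each revoke's lifetime: revoke_lo_add() raises them when a revoke is queued, and revoke_lo_after_commit() drops them once the revoke has reached the log. A toy userspace model of that bookkeeping follows; all names are invented and this is only a sketch of the pairing, not kernel code.

        #include <stdio.h>

        struct glock { int revokes; int lflush; };

        /* queueing a revoke: mirrors revoke_lo_add() */
        static void revoke_add(struct glock *gl)
        {
                gl->revokes++;
                gl->lflush = 1;
        }

        /* revoke written to the log: mirrors revoke_lo_after_commit() */
        static void revoke_commit(struct glock *gl)
        {
                gl->revokes--;
                gl->lflush = 0;
        }

        int main(void)
        {
                struct glock gl = { 0, 0 };

                revoke_add(&gl);
                revoke_add(&gl);
                revoke_commit(&gl);
                revoke_commit(&gl);
                printf("revokes=%d lflush=%d\n", gl.revokes, gl.lflush);
                return 0;
        }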
379static void revoke_lo_before_scan(struct gfs2_jdesc *jd, 393static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
380 struct gfs2_log_header_host *head, int pass) 394 struct gfs2_log_header_host *head, int pass)
381{ 395{
@@ -749,6 +763,7 @@ const struct gfs2_log_operations gfs2_buf_lops = {
749const struct gfs2_log_operations gfs2_revoke_lops = { 763const struct gfs2_log_operations gfs2_revoke_lops = {
750 .lo_add = revoke_lo_add, 764 .lo_add = revoke_lo_add,
751 .lo_before_commit = revoke_lo_before_commit, 765 .lo_before_commit = revoke_lo_before_commit,
766 .lo_after_commit = revoke_lo_after_commit,
752 .lo_before_scan = revoke_lo_before_scan, 767 .lo_before_scan = revoke_lo_before_scan,
753 .lo_scan_elements = revoke_lo_scan_elements, 768 .lo_scan_elements = revoke_lo_scan_elements,
754 .lo_after_scan = revoke_lo_after_scan, 769 .lo_after_scan = revoke_lo_after_scan,
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 888a5f5a1a58..c2b34cd2abe0 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -53,6 +53,7 @@ static void gfs2_init_glock_once(void *foo)
53 INIT_LIST_HEAD(&gl->gl_lru); 53 INIT_LIST_HEAD(&gl->gl_lru);
54 INIT_LIST_HEAD(&gl->gl_ail_list); 54 INIT_LIST_HEAD(&gl->gl_ail_list);
55 atomic_set(&gl->gl_ail_count, 0); 55 atomic_set(&gl->gl_ail_count, 0);
56 atomic_set(&gl->gl_revokes, 0);
56} 57}
57 58
58static void gfs2_init_gl_aspace_once(void *foo) 59static void gfs2_init_gl_aspace_once(void *foo)
@@ -145,7 +146,7 @@ static int __init init_gfs2_fs(void)
145 146
146 gfs2_register_debugfs(); 147 gfs2_register_debugfs();
147 148
148 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__); 149 printk("GFS2 installed\n");
149 150
150 return 0; 151 return 0;
151 152
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 675349b5a133..747238cd9f96 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -31,6 +31,7 @@
31#include "rgrp.h" 31#include "rgrp.h"
32#include "trans.h" 32#include "trans.h"
33#include "util.h" 33#include "util.h"
34#include "trace_gfs2.h"
34 35
35static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) 36static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
36{ 37{
@@ -310,6 +311,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
310 struct gfs2_bufdata *bd = bh->b_private; 311 struct gfs2_bufdata *bd = bh->b_private;
311 312
312 if (test_clear_buffer_pinned(bh)) { 313 if (test_clear_buffer_pinned(bh)) {
314 trace_gfs2_pin(bd, 0);
313 atomic_dec(&sdp->sd_log_pinned); 315 atomic_dec(&sdp->sd_log_pinned);
314 list_del_init(&bd->bd_le.le_list); 316 list_del_init(&bd->bd_le.le_list);
315 if (meta) { 317 if (meta) {
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 6a1d9ba16411..22c526593131 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -77,8 +77,6 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen);
77 77
78#define buffer_busy(bh) \ 78#define buffer_busy(bh) \
79((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned))) 79((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned)))
80#define buffer_in_io(bh) \
81((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock)))
82 80
83#endif /* __DIO_DOT_H__ */ 81#endif /* __DIO_DOT_H__ */
84 82
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index d3c69eb91c74..8ac9ae189b53 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -126,8 +126,10 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
126 * changed. 126 * changed.
127 */ 127 */
128 128
129static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) 129static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
130{ 130{
131 struct gfs2_sb_host *sb = &sdp->sd_sb;
132
131 if (sb->sb_magic != GFS2_MAGIC || 133 if (sb->sb_magic != GFS2_MAGIC ||
132 sb->sb_type != GFS2_METATYPE_SB) { 134 sb->sb_type != GFS2_METATYPE_SB) {
133 if (!silent) 135 if (!silent)
@@ -157,8 +159,10 @@ static void end_bio_io_page(struct bio *bio, int error)
157 unlock_page(page); 159 unlock_page(page);
158} 160}
159 161
160static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf) 162static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf)
161{ 163{
164 struct gfs2_sb_host *sb = &sdp->sd_sb;
165 struct super_block *s = sdp->sd_vfs;
162 const struct gfs2_sb *str = buf; 166 const struct gfs2_sb *str = buf;
163 167
164 sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic); 168 sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
@@ -175,7 +179,7 @@ static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
175 179
176 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); 180 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
177 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); 181 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
178 memcpy(sb->sb_uuid, str->sb_uuid, 16); 182 memcpy(s->s_uuid, str->sb_uuid, 16);
179} 183}
180 184
181/** 185/**
@@ -197,7 +201,7 @@ static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
197 * Returns: 0 on success or error 201 * Returns: 0 on success or error
198 */ 202 */
199 203
200static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) 204static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
201{ 205{
202 struct super_block *sb = sdp->sd_vfs; 206 struct super_block *sb = sdp->sd_vfs;
203 struct gfs2_sb *p; 207 struct gfs2_sb *p;
@@ -227,10 +231,10 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
227 return -EIO; 231 return -EIO;
228 } 232 }
229 p = kmap(page); 233 p = kmap(page);
230 gfs2_sb_in(&sdp->sd_sb, p); 234 gfs2_sb_in(sdp, p);
231 kunmap(page); 235 kunmap(page);
232 __free_page(page); 236 __free_page(page);
233 return 0; 237 return gfs2_check_sb(sdp, silent);
234} 238}
235 239
236/** 240/**
@@ -247,17 +251,13 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
247 unsigned int x; 251 unsigned int x;
248 int error; 252 int error;
249 253
250 error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); 254 error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent);
251 if (error) { 255 if (error) {
252 if (!silent) 256 if (!silent)
253 fs_err(sdp, "can't read superblock\n"); 257 fs_err(sdp, "can't read superblock\n");
254 return error; 258 return error;
255 } 259 }
256 260
257 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
258 if (error)
259 return error;
260
261 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 261 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
262 GFS2_BASIC_BLOCK_SHIFT; 262 GFS2_BASIC_BLOCK_SHIFT;
263 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; 263 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
@@ -340,14 +340,10 @@ static int init_names(struct gfs2_sbd *sdp, int silent)
340 /* Try to autodetect */ 340 /* Try to autodetect */
341 341
342 if (!proto[0] || !table[0]) { 342 if (!proto[0] || !table[0]) {
343 error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); 343 error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent);
344 if (error) 344 if (error)
345 return error; 345 return error;
346 346
347 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
348 if (error)
349 goto out;
350
351 if (!proto[0]) 347 if (!proto[0])
352 proto = sdp->sd_sb.sb_lockproto; 348 proto = sdp->sd_sb.sb_lockproto;
353 if (!table[0]) 349 if (!table[0])
@@ -364,7 +360,6 @@ static int init_names(struct gfs2_sbd *sdp, int silent)
364 while ((table = strchr(table, '/'))) 360 while ((table = strchr(table, '/')))
365 *table = '_'; 361 *table = '_';
366 362
367out:
368 return error; 363 return error;
369} 364}
370 365
@@ -1119,8 +1114,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1119 if (sdp->sd_args.ar_statfs_quantum) { 1114 if (sdp->sd_args.ar_statfs_quantum) {
1120 sdp->sd_tune.gt_statfs_slow = 0; 1115 sdp->sd_tune.gt_statfs_slow = 0;
1121 sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum; 1116 sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum;
1122 } 1117 } else {
1123 else {
1124 sdp->sd_tune.gt_statfs_slow = 1; 1118 sdp->sd_tune.gt_statfs_slow = 1;
1125 sdp->sd_tune.gt_statfs_quantum = 30; 1119 sdp->sd_tune.gt_statfs_quantum = 30;
1126 } 1120 }
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
deleted file mode 100644
index 09e436a50723..000000000000
--- a/fs/gfs2/ops_inode.c
+++ /dev/null
@@ -1,1344 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/slab.h>
11#include <linux/spinlock.h>
12#include <linux/completion.h>
13#include <linux/buffer_head.h>
14#include <linux/namei.h>
15#include <linux/mm.h>
16#include <linux/xattr.h>
17#include <linux/posix_acl.h>
18#include <linux/gfs2_ondisk.h>
19#include <linux/crc32.h>
20#include <linux/fiemap.h>
21#include <asm/uaccess.h>
22
23#include "gfs2.h"
24#include "incore.h"
25#include "acl.h"
26#include "bmap.h"
27#include "dir.h"
28#include "xattr.h"
29#include "glock.h"
30#include "inode.h"
31#include "meta_io.h"
32#include "quota.h"
33#include "rgrp.h"
34#include "trans.h"
35#include "util.h"
36#include "super.h"
37
38/**
39 * gfs2_create - Create a file
40 * @dir: The directory in which to create the file
41 * @dentry: The dentry of the new file
42 * @mode: The mode of the new file
43 *
44 * Returns: errno
45 */
46
47static int gfs2_create(struct inode *dir, struct dentry *dentry,
48 int mode, struct nameidata *nd)
49{
50 struct gfs2_inode *dip = GFS2_I(dir);
51 struct gfs2_sbd *sdp = GFS2_SB(dir);
52 struct gfs2_holder ghs[2];
53 struct inode *inode;
54
55 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
56
57 for (;;) {
58 inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode, 0);
59 if (!IS_ERR(inode)) {
60 gfs2_trans_end(sdp);
61 if (dip->i_alloc->al_rgd)
62 gfs2_inplace_release(dip);
63 gfs2_quota_unlock(dip);
64 gfs2_alloc_put(dip);
65 gfs2_glock_dq_uninit_m(2, ghs);
66 mark_inode_dirty(inode);
67 break;
68 } else if (PTR_ERR(inode) != -EEXIST ||
69 (nd && nd->flags & LOOKUP_EXCL)) {
70 gfs2_holder_uninit(ghs);
71 return PTR_ERR(inode);
72 }
73
74 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
75 if (inode) {
76 if (!IS_ERR(inode)) {
77 gfs2_holder_uninit(ghs);
78 break;
79 } else {
80 gfs2_holder_uninit(ghs);
81 return PTR_ERR(inode);
82 }
83 }
84 }
85
86 d_instantiate(dentry, inode);
87
88 return 0;
89}
90
91/**
92 * gfs2_lookup - Look up a filename in a directory and return its inode
93 * @dir: The directory inode
94 * @dentry: The dentry of the new inode
95 * @nd: passed from Linux VFS, ignored by us
96 *
97 * Called by the VFS layer. Lock dir and call gfs2_lookupi()
98 *
99 * Returns: errno
100 */
101
102static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
103 struct nameidata *nd)
104{
105 struct inode *inode = NULL;
106
107 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
108 if (inode && IS_ERR(inode))
109 return ERR_CAST(inode);
110
111 if (inode) {
112 struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
113 struct gfs2_holder gh;
114 int error;
115 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
116 if (error) {
117 iput(inode);
118 return ERR_PTR(error);
119 }
120 gfs2_glock_dq_uninit(&gh);
121 return d_splice_alias(inode, dentry);
122 }
123 d_add(dentry, inode);
124
125 return NULL;
126}
127
128/**
129 * gfs2_link - Link to a file
130 * @old_dentry: The inode to link
131 * @dir: Add link to this directory
132 * @dentry: The name of the link
133 *
134 * Link the inode in "old_dentry" into the directory "dir" with the
135 * name in "dentry".
136 *
137 * Returns: errno
138 */
139
140static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
141 struct dentry *dentry)
142{
143 struct gfs2_inode *dip = GFS2_I(dir);
144 struct gfs2_sbd *sdp = GFS2_SB(dir);
145 struct inode *inode = old_dentry->d_inode;
146 struct gfs2_inode *ip = GFS2_I(inode);
147 struct gfs2_holder ghs[2];
148 int alloc_required;
149 int error;
150
151 if (S_ISDIR(inode->i_mode))
152 return -EPERM;
153
154 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
155 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
156
157 error = gfs2_glock_nq(ghs); /* parent */
158 if (error)
159 goto out_parent;
160
161 error = gfs2_glock_nq(ghs + 1); /* child */
162 if (error)
163 goto out_child;
164
165 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0);
166 if (error)
167 goto out_gunlock;
168
169 error = gfs2_dir_check(dir, &dentry->d_name, NULL);
170 switch (error) {
171 case -ENOENT:
172 break;
173 case 0:
174 error = -EEXIST;
175 default:
176 goto out_gunlock;
177 }
178
179 error = -EINVAL;
180 if (!dip->i_inode.i_nlink)
181 goto out_gunlock;
182 error = -EFBIG;
183 if (dip->i_entries == (u32)-1)
184 goto out_gunlock;
185 error = -EPERM;
186 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
187 goto out_gunlock;
188 error = -EINVAL;
189 if (!ip->i_inode.i_nlink)
190 goto out_gunlock;
191 error = -EMLINK;
192 if (ip->i_inode.i_nlink == (u32)-1)
193 goto out_gunlock;
194
195 alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
196 if (error < 0)
197 goto out_gunlock;
198 error = 0;
199
200 if (alloc_required) {
201 struct gfs2_alloc *al = gfs2_alloc_get(dip);
202 if (!al) {
203 error = -ENOMEM;
204 goto out_gunlock;
205 }
206
207 error = gfs2_quota_lock_check(dip);
208 if (error)
209 goto out_alloc;
210
211 al->al_requested = sdp->sd_max_dirres;
212
213 error = gfs2_inplace_reserve(dip);
214 if (error)
215 goto out_gunlock_q;
216
217 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
218 gfs2_rg_blocks(al) +
219 2 * RES_DINODE + RES_STATFS +
220 RES_QUOTA, 0);
221 if (error)
222 goto out_ipres;
223 } else {
224 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
225 if (error)
226 goto out_ipres;
227 }
228
229 error = gfs2_dir_add(dir, &dentry->d_name, ip, IF2DT(inode->i_mode));
230 if (error)
231 goto out_end_trans;
232
233 error = gfs2_change_nlink(ip, +1);
234
235out_end_trans:
236 gfs2_trans_end(sdp);
237out_ipres:
238 if (alloc_required)
239 gfs2_inplace_release(dip);
240out_gunlock_q:
241 if (alloc_required)
242 gfs2_quota_unlock(dip);
243out_alloc:
244 if (alloc_required)
245 gfs2_alloc_put(dip);
246out_gunlock:
247 gfs2_glock_dq(ghs + 1);
248out_child:
249 gfs2_glock_dq(ghs);
250out_parent:
251 gfs2_holder_uninit(ghs);
252 gfs2_holder_uninit(ghs + 1);
253 if (!error) {
254 ihold(inode);
255 d_instantiate(dentry, inode);
256 mark_inode_dirty(inode);
257 }
258 return error;
259}
260
261/*
 262 * gfs2_unlink_ok - check to see that an inode is still in a directory
263 * @dip: the directory
264 * @name: the name of the file
265 * @ip: the inode
266 *
267 * Assumes that the lock on (at least) @dip is held.
268 *
269 * Returns: 0 if the parent/child relationship is correct, errno if it isn't
270 */
271
272static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
273 const struct gfs2_inode *ip)
274{
275 int error;
276
277 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
278 return -EPERM;
279
280 if ((dip->i_inode.i_mode & S_ISVTX) &&
281 dip->i_inode.i_uid != current_fsuid() &&
282 ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
283 return -EPERM;
284
285 if (IS_APPEND(&dip->i_inode))
286 return -EPERM;
287
288 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
289 if (error)
290 return error;
291
292 error = gfs2_dir_check(&dip->i_inode, name, ip);
293 if (error)
294 return error;
295
296 return 0;
297}
298
299/**
300 * gfs2_unlink - Unlink a file
301 * @dir: The inode of the directory containing the file to unlink
302 * @dentry: The file itself
303 *
 304 * Unlink a file.
305 *
306 * Returns: errno
307 */
308
309static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
310{
311 struct gfs2_inode *dip = GFS2_I(dir);
312 struct gfs2_sbd *sdp = GFS2_SB(dir);
313 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
314 struct gfs2_holder ghs[3];
315 struct gfs2_rgrpd *rgd;
316 struct gfs2_holder ri_gh;
317 int error;
318
319 error = gfs2_rindex_hold(sdp, &ri_gh);
320 if (error)
321 return error;
322
323 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
324 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
325
326 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
327 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
328
329
330 error = gfs2_glock_nq(ghs); /* parent */
331 if (error)
332 goto out_parent;
333
334 error = gfs2_glock_nq(ghs + 1); /* child */
335 if (error)
336 goto out_child;
337
338 error = gfs2_glock_nq(ghs + 2); /* rgrp */
339 if (error)
340 goto out_rgrp;
341
342 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
343 if (error)
344 goto out_gunlock;
345
346 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
347 if (error)
348 goto out_gunlock;
349
350 error = gfs2_dir_del(dip, &dentry->d_name);
351 if (error)
352 goto out_end_trans;
353
354 error = gfs2_change_nlink(ip, -1);
355
356out_end_trans:
357 gfs2_trans_end(sdp);
358out_gunlock:
359 gfs2_glock_dq(ghs + 2);
360out_rgrp:
361 gfs2_holder_uninit(ghs + 2);
362 gfs2_glock_dq(ghs + 1);
363out_child:
364 gfs2_holder_uninit(ghs + 1);
365 gfs2_glock_dq(ghs);
366out_parent:
367 gfs2_holder_uninit(ghs);
368 gfs2_glock_dq_uninit(&ri_gh);
369 return error;
370}
371
372/**
373 * gfs2_symlink - Create a symlink
374 * @dir: The directory to create the symlink in
375 * @dentry: The dentry to put the symlink in
376 * @symname: The thing which the link points to
377 *
378 * Returns: errno
379 */
380
381static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
382 const char *symname)
383{
384 struct gfs2_inode *dip = GFS2_I(dir), *ip;
385 struct gfs2_sbd *sdp = GFS2_SB(dir);
386 struct gfs2_holder ghs[2];
387 struct inode *inode;
388 struct buffer_head *dibh;
389 int size;
390 int error;
391
392 /* Must be stuffed with a null terminator for gfs2_follow_link() */
393 size = strlen(symname);
394 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
395 return -ENAMETOOLONG;
396
397 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
398
399 inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO, 0);
400 if (IS_ERR(inode)) {
401 gfs2_holder_uninit(ghs);
402 return PTR_ERR(inode);
403 }
404
405 ip = ghs[1].gh_gl->gl_object;
406
407 i_size_write(inode, size);
408
409 error = gfs2_meta_inode_buffer(ip, &dibh);
410
411 if (!gfs2_assert_withdraw(sdp, !error)) {
412 gfs2_dinode_out(ip, dibh->b_data);
413 memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname,
414 size);
415 brelse(dibh);
416 }
417
418 gfs2_trans_end(sdp);
419 if (dip->i_alloc->al_rgd)
420 gfs2_inplace_release(dip);
421 gfs2_quota_unlock(dip);
422 gfs2_alloc_put(dip);
423
424 gfs2_glock_dq_uninit_m(2, ghs);
425
426 d_instantiate(dentry, inode);
427 mark_inode_dirty(inode);
428
429 return 0;
430}
431
432/**
433 * gfs2_mkdir - Make a directory
434 * @dir: The parent directory of the new one
435 * @dentry: The dentry of the new directory
436 * @mode: The mode of the new directory
437 *
438 * Returns: errno
439 */
440
441static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
442{
443 struct gfs2_inode *dip = GFS2_I(dir), *ip;
444 struct gfs2_sbd *sdp = GFS2_SB(dir);
445 struct gfs2_holder ghs[2];
446 struct inode *inode;
447 struct buffer_head *dibh;
448 int error;
449
450 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
451
452 inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode, 0);
453 if (IS_ERR(inode)) {
454 gfs2_holder_uninit(ghs);
455 return PTR_ERR(inode);
456 }
457
458 ip = ghs[1].gh_gl->gl_object;
459
460 ip->i_inode.i_nlink = 2;
461 i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
462 ip->i_diskflags |= GFS2_DIF_JDATA;
463 ip->i_entries = 2;
464
465 error = gfs2_meta_inode_buffer(ip, &dibh);
466
467 if (!gfs2_assert_withdraw(sdp, !error)) {
468 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
469 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
470
471 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
472 gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
473 dent->de_inum = di->di_num; /* already GFS2 endian */
474 dent->de_type = cpu_to_be16(DT_DIR);
475 di->di_entries = cpu_to_be32(1);
476
477 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
478 gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
479
480 gfs2_inum_out(dip, dent);
481 dent->de_type = cpu_to_be16(DT_DIR);
482
483 gfs2_dinode_out(ip, di);
484
485 brelse(dibh);
486 }
487
488 error = gfs2_change_nlink(dip, +1);
489 gfs2_assert_withdraw(sdp, !error); /* dip already pinned */
490
491 gfs2_trans_end(sdp);
492 if (dip->i_alloc->al_rgd)
493 gfs2_inplace_release(dip);
494 gfs2_quota_unlock(dip);
495 gfs2_alloc_put(dip);
496
497 gfs2_glock_dq_uninit_m(2, ghs);
498
499 d_instantiate(dentry, inode);
500 mark_inode_dirty(inode);
501
502 return 0;
503}
504
505/**
506 * gfs2_rmdiri - Remove a directory
507 * @dip: The parent directory of the directory to be removed
508 * @name: The name of the directory to be removed
509 * @ip: The GFS2 inode of the directory to be removed
510 *
511 * Assumes Glocks on dip and ip are held
512 *
513 * Returns: errno
514 */
515
516static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
517 struct gfs2_inode *ip)
518{
519 int error;
520
521 if (ip->i_entries != 2) {
522 if (gfs2_consist_inode(ip))
523 gfs2_dinode_print(ip);
524 return -EIO;
525 }
526
527 error = gfs2_dir_del(dip, name);
528 if (error)
529 return error;
530
531 error = gfs2_change_nlink(dip, -1);
532 if (error)
533 return error;
534
535 error = gfs2_dir_del(ip, &gfs2_qdot);
536 if (error)
537 return error;
538
539 error = gfs2_dir_del(ip, &gfs2_qdotdot);
540 if (error)
541 return error;
542
 543 /* It looks odd, but it really must be done twice: once for the
 name in the parent and once for the directory's own "." entry */
544 error = gfs2_change_nlink(ip, -1);
545 if (error)
546 return error;
547
548 error = gfs2_change_nlink(ip, -1);
549 if (error)
550 return error;
551
552 return error;
553}
554
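The double decrement in gfs2_rmdiri() above reflects how directory link counts are wired: a directory is referenced both by its name in the parent and by its own "." entry, and the parent in turn holds a link contributed by the child's "..". A worked example of the arithmetic in plain C, not part of the patch:

        #include <assert.h>

        int main(void)
        {
                int parent_nlink = 3;   /* ".", its own name, child's ".." */
                int child_nlink = 2;    /* its name in parent, plus "." */

                parent_nlink -= 1;      /* child's ".." goes away */
                child_nlink -= 1;       /* name removed from parent */
                child_nlink -= 1;       /* "." removed */

                assert(parent_nlink == 2);
                assert(child_nlink == 0);
                return 0;
        }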
555/**
556 * gfs2_rmdir - Remove a directory
557 * @dir: The parent directory of the directory to be removed
558 * @dentry: The dentry of the directory to remove
559 *
560 * Remove a directory. Call gfs2_rmdiri()
561 *
562 * Returns: errno
563 */
564
565static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
566{
567 struct gfs2_inode *dip = GFS2_I(dir);
568 struct gfs2_sbd *sdp = GFS2_SB(dir);
569 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
570 struct gfs2_holder ghs[3];
571 struct gfs2_rgrpd *rgd;
572 struct gfs2_holder ri_gh;
573 int error;
574
575 error = gfs2_rindex_hold(sdp, &ri_gh);
576 if (error)
577 return error;
578 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
579 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
580
581 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
582 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
583
584 error = gfs2_glock_nq(ghs); /* parent */
585 if (error)
586 goto out_parent;
587
588 error = gfs2_glock_nq(ghs + 1); /* child */
589 if (error)
590 goto out_child;
591
592 error = gfs2_glock_nq(ghs + 2); /* rgrp */
593 if (error)
594 goto out_rgrp;
595
596 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
597 if (error)
598 goto out_gunlock;
599
600 if (ip->i_entries < 2) {
601 if (gfs2_consist_inode(ip))
602 gfs2_dinode_print(ip);
603 error = -EIO;
604 goto out_gunlock;
605 }
606 if (ip->i_entries > 2) {
607 error = -ENOTEMPTY;
608 goto out_gunlock;
609 }
610
611 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF + RES_RG_BIT, 0);
612 if (error)
613 goto out_gunlock;
614
615 error = gfs2_rmdiri(dip, &dentry->d_name, ip);
616
617 gfs2_trans_end(sdp);
618
619out_gunlock:
620 gfs2_glock_dq(ghs + 2);
621out_rgrp:
622 gfs2_holder_uninit(ghs + 2);
623 gfs2_glock_dq(ghs + 1);
624out_child:
625 gfs2_holder_uninit(ghs + 1);
626 gfs2_glock_dq(ghs);
627out_parent:
628 gfs2_holder_uninit(ghs);
629 gfs2_glock_dq_uninit(&ri_gh);
630 return error;
631}
632
633/**
634 * gfs2_mknod - Make a special file
635 * @dir: The directory in which the special file will reside
636 * @dentry: The dentry of the special file
637 * @mode: The mode of the special file
638 * @rdev: The device specification of the special file
639 *
640 */
641
642static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
643 dev_t dev)
644{
645 struct gfs2_inode *dip = GFS2_I(dir);
646 struct gfs2_sbd *sdp = GFS2_SB(dir);
647 struct gfs2_holder ghs[2];
648 struct inode *inode;
649
650 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
651
652 inode = gfs2_createi(ghs, &dentry->d_name, mode, dev);
653 if (IS_ERR(inode)) {
654 gfs2_holder_uninit(ghs);
655 return PTR_ERR(inode);
656 }
657
658 gfs2_trans_end(sdp);
659 if (dip->i_alloc->al_rgd)
660 gfs2_inplace_release(dip);
661 gfs2_quota_unlock(dip);
662 gfs2_alloc_put(dip);
663
664 gfs2_glock_dq_uninit_m(2, ghs);
665
666 d_instantiate(dentry, inode);
667 mark_inode_dirty(inode);
668
669 return 0;
670}
671
672/*
673 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
674 * @this: move this
675 * @to: to here
676 *
677 * Follow @to back to the root and make sure we don't encounter @this
678 * Assumes we already hold the rename lock.
679 *
680 * Returns: errno
681 */
682
683static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
684{
685 struct inode *dir = &to->i_inode;
686 struct super_block *sb = dir->i_sb;
687 struct inode *tmp;
688 int error = 0;
689
690 igrab(dir);
691
692 for (;;) {
693 if (dir == &this->i_inode) {
694 error = -EINVAL;
695 break;
696 }
697 if (dir == sb->s_root->d_inode) {
698 error = 0;
699 break;
700 }
701
702 tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
703 if (IS_ERR(tmp)) {
704 error = PTR_ERR(tmp);
705 break;
706 }
707
708 iput(dir);
709 dir = tmp;
710 }
711
712 iput(dir);
713
714 return error;
715}
716
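gfs2_ok_to_move() above guards against creating a cycle by looking up ".." from the destination directory until it reaches either the root or the directory being moved. A self-contained sketch of the same check using plain parent pointers; struct node and ok_to_move() are invented here and the real code does the walk via gfs2_lookupi() with proper locking and reference counting.

        #include <stddef.h>
        #include <stdio.h>
        #include <errno.h>

        struct node { struct node *parent; const char *name; };

        static int ok_to_move(struct node *this, struct node *to)
        {
                for (struct node *dir = to; dir; dir = dir->parent)
                        if (dir == this)
                                return -EINVAL; /* would create a cycle */
                return 0;
        }

        int main(void)
        {
                struct node root = { NULL, "/" };
                struct node a = { &root, "a" };
                struct node b = { &a, "b" };

                printf("move a under b: %d\n", ok_to_move(&a, &b));     /* -EINVAL */
                printf("move b under root: %d\n", ok_to_move(&b, &root)); /* 0 */
                return 0;
        }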
717/**
718 * gfs2_rename - Rename a file
719 * @odir: Parent directory of old file name
720 * @odentry: The old dentry of the file
721 * @ndir: Parent directory of new file name
722 * @ndentry: The new dentry of the file
723 *
724 * Returns: errno
725 */
726
727static int gfs2_rename(struct inode *odir, struct dentry *odentry,
728 struct inode *ndir, struct dentry *ndentry)
729{
730 struct gfs2_inode *odip = GFS2_I(odir);
731 struct gfs2_inode *ndip = GFS2_I(ndir);
732 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
733 struct gfs2_inode *nip = NULL;
734 struct gfs2_sbd *sdp = GFS2_SB(odir);
735 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
736 struct gfs2_rgrpd *nrgd;
737 unsigned int num_gh;
738 int dir_rename = 0;
739 int alloc_required = 0;
740 unsigned int x;
741 int error;
742
743 if (ndentry->d_inode) {
744 nip = GFS2_I(ndentry->d_inode);
745 if (ip == nip)
746 return 0;
747 }
748
749 error = gfs2_rindex_hold(sdp, &ri_gh);
750 if (error)
751 return error;
752
753 if (odip != ndip) {
754 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
755 0, &r_gh);
756 if (error)
757 goto out;
758
759 if (S_ISDIR(ip->i_inode.i_mode)) {
760 dir_rename = 1;
 761 /* don't move a directory into its own subdirectory */
762 error = gfs2_ok_to_move(ip, ndip);
763 if (error)
764 goto out_gunlock_r;
765 }
766 }
767
768 num_gh = 1;
769 gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
770 if (odip != ndip) {
771 gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
772 num_gh++;
773 }
774 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
775 num_gh++;
776
777 if (nip) {
778 gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
779 num_gh++;
 780 /* grab the resource group lock for unlink flag twiddling;
 781 * this is the case where the target file already exists,
 782 * so we unlink it before doing the rename
 783 */
784 nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr);
785 if (nrgd)
786 gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
787 }
788
789 for (x = 0; x < num_gh; x++) {
790 error = gfs2_glock_nq(ghs + x);
791 if (error)
792 goto out_gunlock;
793 }
794
795 /* Check out the old directory */
796
797 error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
798 if (error)
799 goto out_gunlock;
800
801 /* Check out the new directory */
802
803 if (nip) {
804 error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
805 if (error)
806 goto out_gunlock;
807
808 if (S_ISDIR(nip->i_inode.i_mode)) {
809 if (nip->i_entries < 2) {
810 if (gfs2_consist_inode(nip))
811 gfs2_dinode_print(nip);
812 error = -EIO;
813 goto out_gunlock;
814 }
815 if (nip->i_entries > 2) {
816 error = -ENOTEMPTY;
817 goto out_gunlock;
818 }
819 }
820 } else {
821 error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0);
822 if (error)
823 goto out_gunlock;
824
825 error = gfs2_dir_check(ndir, &ndentry->d_name, NULL);
826 switch (error) {
827 case -ENOENT:
828 error = 0;
829 break;
830 case 0:
831 error = -EEXIST;
832 default:
833 goto out_gunlock;
834 };
835
836 if (odip != ndip) {
837 if (!ndip->i_inode.i_nlink) {
838 error = -EINVAL;
839 goto out_gunlock;
840 }
841 if (ndip->i_entries == (u32)-1) {
842 error = -EFBIG;
843 goto out_gunlock;
844 }
845 if (S_ISDIR(ip->i_inode.i_mode) &&
846 ndip->i_inode.i_nlink == (u32)-1) {
847 error = -EMLINK;
848 goto out_gunlock;
849 }
850 }
851 }
852
853 /* Check out the dir to be renamed */
854
855 if (dir_rename) {
856 error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0);
857 if (error)
858 goto out_gunlock;
859 }
860
861 if (nip == NULL)
862 alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
863 error = alloc_required;
864 if (error < 0)
865 goto out_gunlock;
866 error = 0;
867
868 if (alloc_required) {
869 struct gfs2_alloc *al = gfs2_alloc_get(ndip);
870 if (!al) {
871 error = -ENOMEM;
872 goto out_gunlock;
873 }
874
875 error = gfs2_quota_lock_check(ndip);
876 if (error)
877 goto out_alloc;
878
879 al->al_requested = sdp->sd_max_dirres;
880
881 error = gfs2_inplace_reserve_ri(ndip);
882 if (error)
883 goto out_gunlock_q;
884
885 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
886 gfs2_rg_blocks(al) +
887 4 * RES_DINODE + 4 * RES_LEAF +
888 RES_STATFS + RES_QUOTA + 4, 0);
889 if (error)
890 goto out_ipreserv;
891 } else {
892 error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
893 5 * RES_LEAF + 4, 0);
894 if (error)
895 goto out_gunlock;
896 }
897
898 /* Remove the target file, if it exists */
899
900 if (nip) {
901 if (S_ISDIR(nip->i_inode.i_mode))
902 error = gfs2_rmdiri(ndip, &ndentry->d_name, nip);
903 else {
904 error = gfs2_dir_del(ndip, &ndentry->d_name);
905 if (error)
906 goto out_end_trans;
907 error = gfs2_change_nlink(nip, -1);
908 }
909 if (error)
910 goto out_end_trans;
911 }
912
913 if (dir_rename) {
914 error = gfs2_change_nlink(ndip, +1);
915 if (error)
916 goto out_end_trans;
917 error = gfs2_change_nlink(odip, -1);
918 if (error)
919 goto out_end_trans;
920
921 error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
922 if (error)
923 goto out_end_trans;
924 } else {
925 struct buffer_head *dibh;
926 error = gfs2_meta_inode_buffer(ip, &dibh);
927 if (error)
928 goto out_end_trans;
929 ip->i_inode.i_ctime = CURRENT_TIME;
930 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
931 gfs2_dinode_out(ip, dibh->b_data);
932 brelse(dibh);
933 }
934
935 error = gfs2_dir_del(odip, &odentry->d_name);
936 if (error)
937 goto out_end_trans;
938
939 error = gfs2_dir_add(ndir, &ndentry->d_name, ip, IF2DT(ip->i_inode.i_mode));
940 if (error)
941 goto out_end_trans;
942
943out_end_trans:
944 gfs2_trans_end(sdp);
945out_ipreserv:
946 if (alloc_required)
947 gfs2_inplace_release(ndip);
948out_gunlock_q:
949 if (alloc_required)
950 gfs2_quota_unlock(ndip);
951out_alloc:
952 if (alloc_required)
953 gfs2_alloc_put(ndip);
954out_gunlock:
955 while (x--) {
956 gfs2_glock_dq(ghs + x);
957 gfs2_holder_uninit(ghs + x);
958 }
959out_gunlock_r:
960 if (r_gh.gh_gl)
961 gfs2_glock_dq_uninit(&r_gh);
962out:
963 gfs2_glock_dq_uninit(&ri_gh);
964 return error;
965}
966
967/**
968 * gfs2_follow_link - Follow a symbolic link
969 * @dentry: The dentry of the link
970 * @nd: Data that we pass to vfs_follow_link()
971 *
972 * This can handle symlinks of any size.
973 *
974 * Returns: 0 on success or error code
975 */
976
977static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
978{
979 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
980 struct gfs2_holder i_gh;
981 struct buffer_head *dibh;
982 unsigned int x, size;
983 char *buf;
984 int error;
985
986 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
987 error = gfs2_glock_nq(&i_gh);
988 if (error) {
989 gfs2_holder_uninit(&i_gh);
990 nd_set_link(nd, ERR_PTR(error));
991 return NULL;
992 }
993
994 size = (unsigned int)i_size_read(&ip->i_inode);
995 if (size == 0) {
996 gfs2_consist_inode(ip);
997 buf = ERR_PTR(-EIO);
998 goto out;
999 }
1000
1001 error = gfs2_meta_inode_buffer(ip, &dibh);
1002 if (error) {
1003 buf = ERR_PTR(error);
1004 goto out;
1005 }
1006
1007 x = size + 1;
1008 buf = kmalloc(x, GFP_NOFS);
1009 if (!buf)
1010 buf = ERR_PTR(-ENOMEM);
1011 else
1012 memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1013 brelse(dibh);
1014out:
1015 gfs2_glock_dq_uninit(&i_gh);
1016 nd_set_link(nd, buf);
1017 return NULL;
1018}
1019
1020static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1021{
1022 char *s = nd_get_link(nd);
1023 if (!IS_ERR(s))
1024 kfree(s);
1025}
1026
1027/**
 1028 * gfs2_permission - Check whether access to an inode is allowed
1029 * @inode: The inode
1030 * @mask: The mask to be tested
1031 * @flags: Indicates whether this is an RCU path walk or not
1032 *
1033 * This may be called from the VFS directly, or from within GFS2 with the
1034 * inode locked, so we look to see if the glock is already locked and only
 1035 * lock the glock if it has not already been done.
1036 *
1037 * Returns: errno
1038 */
1039
1040int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
1041{
1042 struct gfs2_inode *ip;
1043 struct gfs2_holder i_gh;
1044 int error;
1045 int unlock = 0;
1046
1047
1048 ip = GFS2_I(inode);
1049 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
1050 if (flags & IPERM_FLAG_RCU)
1051 return -ECHILD;
1052 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
1053 if (error)
1054 return error;
1055 unlock = 1;
1056 }
1057
1058 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
1059 error = -EACCES;
1060 else
1061 error = generic_permission(inode, mask, flags, gfs2_check_acl);
1062 if (unlock)
1063 gfs2_glock_dq_uninit(&i_gh);
1064
1065 return error;
1066}
1067
1068static int setattr_chown(struct inode *inode, struct iattr *attr)
1069{
1070 struct gfs2_inode *ip = GFS2_I(inode);
1071 struct gfs2_sbd *sdp = GFS2_SB(inode);
1072 u32 ouid, ogid, nuid, ngid;
1073 int error;
1074
1075 ouid = inode->i_uid;
1076 ogid = inode->i_gid;
1077 nuid = attr->ia_uid;
1078 ngid = attr->ia_gid;
1079
1080 if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
1081 ouid = nuid = NO_QUOTA_CHANGE;
1082 if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
1083 ogid = ngid = NO_QUOTA_CHANGE;
1084
1085 if (!gfs2_alloc_get(ip))
1086 return -ENOMEM;
1087
1088 error = gfs2_quota_lock(ip, nuid, ngid);
1089 if (error)
1090 goto out_alloc;
1091
1092 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
1093 error = gfs2_quota_check(ip, nuid, ngid);
1094 if (error)
1095 goto out_gunlock_q;
1096 }
1097
1098 error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
1099 if (error)
1100 goto out_gunlock_q;
1101
1102 error = gfs2_setattr_simple(ip, attr);
1103 if (error)
1104 goto out_end_trans;
1105
1106 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
1107 u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
1108 gfs2_quota_change(ip, -blocks, ouid, ogid);
1109 gfs2_quota_change(ip, blocks, nuid, ngid);
1110 }
1111
1112out_end_trans:
1113 gfs2_trans_end(sdp);
1114out_gunlock_q:
1115 gfs2_quota_unlock(ip);
1116out_alloc:
1117 gfs2_alloc_put(ip);
1118 return error;
1119}
1120
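When setattr_chown() above actually changes the owner, the inode's block usage is refunded to the old uid/gid and charged to the new ones inside a single transaction, which is what the paired gfs2_quota_change() calls express. A small model of that transfer with invented types, not part of the patch:

        #include <assert.h>

        struct quota { long blocks; };

        /* move an inode's block usage from one owner's quota to another's */
        static void transfer(struct quota *from, struct quota *to, long blocks)
        {
                from->blocks -= blocks; /* refund the old owner */
                to->blocks += blocks;   /* charge the new owner */
        }

        int main(void)
        {
                struct quota old_uid = { .blocks = 100 };
                struct quota new_uid = { .blocks = 0 };

                transfer(&old_uid, &new_uid, 40); /* chown a 40-block inode */
                assert(old_uid.blocks == 60 && new_uid.blocks == 40);
                return 0;
        }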
1121/**
1122 * gfs2_setattr - Change attributes on an inode
1123 * @dentry: The dentry which is changing
1124 * @attr: The structure describing the change
1125 *
1126 * The VFS layer wants to change one or more of an inode's attributes. Write
1127 * that change out to disk.
1128 *
1129 * Returns: errno
1130 */
1131
1132static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
1133{
1134 struct inode *inode = dentry->d_inode;
1135 struct gfs2_inode *ip = GFS2_I(inode);
1136 struct gfs2_holder i_gh;
1137 int error;
1138
1139 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1140 if (error)
1141 return error;
1142
1143 error = -EPERM;
1144 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
1145 goto out;
1146
1147 error = inode_change_ok(inode, attr);
1148 if (error)
1149 goto out;
1150
1151 if (attr->ia_valid & ATTR_SIZE)
1152 error = gfs2_setattr_size(inode, attr->ia_size);
1153 else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
1154 error = setattr_chown(inode, attr);
1155 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
1156 error = gfs2_acl_chmod(ip, attr);
1157 else
1158 error = gfs2_setattr_simple(ip, attr);
1159
1160out:
1161 gfs2_glock_dq_uninit(&i_gh);
1162 if (!error)
1163 mark_inode_dirty(inode);
1164 return error;
1165}
1166
1167/**
1168 * gfs2_getattr - Read out an inode's attributes
1169 * @mnt: The vfsmount the inode is being accessed from
1170 * @dentry: The dentry to stat
1171 * @stat: The inode's stats
1172 *
1173 * This may be called from the VFS directly, or from within GFS2 with the
1174 * inode locked, so we look to see if the glock is already locked and only
1175 * lock the glock if it has not already been done. Note that it is the NFS
1176 * readdirplus operation which causes this to be called (from filldir)
1177 * with the glock already held.
1178 *
1179 * Returns: errno
1180 */
1181
1182static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
1183 struct kstat *stat)
1184{
1185 struct inode *inode = dentry->d_inode;
1186 struct gfs2_inode *ip = GFS2_I(inode);
1187 struct gfs2_holder gh;
1188 int error;
1189 int unlock = 0;
1190
1191 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
1192 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
1193 if (error)
1194 return error;
1195 unlock = 1;
1196 }
1197
1198 generic_fillattr(inode, stat);
1199 if (unlock)
1200 gfs2_glock_dq_uninit(&gh);
1201
1202 return 0;
1203}
1204
1205static int gfs2_setxattr(struct dentry *dentry, const char *name,
1206 const void *data, size_t size, int flags)
1207{
1208 struct inode *inode = dentry->d_inode;
1209 struct gfs2_inode *ip = GFS2_I(inode);
1210 struct gfs2_holder gh;
1211 int ret;
1212
1213 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1214 ret = gfs2_glock_nq(&gh);
1215 if (ret == 0) {
1216 ret = generic_setxattr(dentry, name, data, size, flags);
1217 gfs2_glock_dq(&gh);
1218 }
1219 gfs2_holder_uninit(&gh);
1220 return ret;
1221}
1222
1223static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
1224 void *data, size_t size)
1225{
1226 struct inode *inode = dentry->d_inode;
1227 struct gfs2_inode *ip = GFS2_I(inode);
1228 struct gfs2_holder gh;
1229 int ret;
1230
1231 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
1232 ret = gfs2_glock_nq(&gh);
1233 if (ret == 0) {
1234 ret = generic_getxattr(dentry, name, data, size);
1235 gfs2_glock_dq(&gh);
1236 }
1237 gfs2_holder_uninit(&gh);
1238 return ret;
1239}
1240
1241static int gfs2_removexattr(struct dentry *dentry, const char *name)
1242{
1243 struct inode *inode = dentry->d_inode;
1244 struct gfs2_inode *ip = GFS2_I(inode);
1245 struct gfs2_holder gh;
1246 int ret;
1247
1248 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1249 ret = gfs2_glock_nq(&gh);
1250 if (ret == 0) {
1251 ret = generic_removexattr(dentry, name);
1252 gfs2_glock_dq(&gh);
1253 }
1254 gfs2_holder_uninit(&gh);
1255 return ret;
1256}
1257
1258static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1259 u64 start, u64 len)
1260{
1261 struct gfs2_inode *ip = GFS2_I(inode);
1262 struct gfs2_holder gh;
1263 int ret;
1264
1265 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
1266 if (ret)
1267 return ret;
1268
1269 mutex_lock(&inode->i_mutex);
1270
1271 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
1272 if (ret)
1273 goto out;
1274
1275 if (gfs2_is_stuffed(ip)) {
1276 u64 phys = ip->i_no_addr << inode->i_blkbits;
1277 u64 size = i_size_read(inode);
1278 u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED|
1279 FIEMAP_EXTENT_DATA_INLINE;
1280 phys += sizeof(struct gfs2_dinode);
1281 phys += start;
1282 if (start + len > size)
1283 len = size - start;
1284 if (start < size)
1285 ret = fiemap_fill_next_extent(fieinfo, start, phys,
1286 len, flags);
1287 if (ret == 1)
1288 ret = 0;
1289 } else {
1290 ret = __generic_block_fiemap(inode, fieinfo, start, len,
1291 gfs2_block_map);
1292 }
1293
1294 gfs2_glock_dq_uninit(&gh);
1295out:
1296 mutex_unlock(&inode->i_mutex);
1297 return ret;
1298}
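/*
 * Illustrative sketch (not GFS2 code): the physical-offset arithmetic
 * for the stuffed (inline-data) case in gfs2_fiemap() above. Inline
 * data lives in the dinode block itself, just past the on-disk dinode
 * header, so the byte address is (block number << blkbits) + header
 * size + start. The header size below is a stand-in, not authoritative.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t no_addr = 0x1234;      /* dinode block number (i_no_addr) */
        unsigned int blkbits = 12;      /* 4096-byte blocks */
        uint64_t dinode_hdr = 232;      /* stand-in for sizeof(struct gfs2_dinode) */
        uint64_t start = 0;             /* fiemap start offset within the file */

        uint64_t phys = (no_addr << blkbits) + dinode_hdr + start;

        printf("inline data begins at byte 0x%llx\n", (unsigned long long)phys);
        return 0;
}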
1299
1300const struct inode_operations gfs2_file_iops = {
1301 .permission = gfs2_permission,
1302 .setattr = gfs2_setattr,
1303 .getattr = gfs2_getattr,
1304 .setxattr = gfs2_setxattr,
1305 .getxattr = gfs2_getxattr,
1306 .listxattr = gfs2_listxattr,
1307 .removexattr = gfs2_removexattr,
1308 .fiemap = gfs2_fiemap,
1309};
1310
1311const struct inode_operations gfs2_dir_iops = {
1312 .create = gfs2_create,
1313 .lookup = gfs2_lookup,
1314 .link = gfs2_link,
1315 .unlink = gfs2_unlink,
1316 .symlink = gfs2_symlink,
1317 .mkdir = gfs2_mkdir,
1318 .rmdir = gfs2_rmdir,
1319 .mknod = gfs2_mknod,
1320 .rename = gfs2_rename,
1321 .permission = gfs2_permission,
1322 .setattr = gfs2_setattr,
1323 .getattr = gfs2_getattr,
1324 .setxattr = gfs2_setxattr,
1325 .getxattr = gfs2_getxattr,
1326 .listxattr = gfs2_listxattr,
1327 .removexattr = gfs2_removexattr,
1328 .fiemap = gfs2_fiemap,
1329};
1330
1331const struct inode_operations gfs2_symlink_iops = {
1332 .readlink = generic_readlink,
1333 .follow_link = gfs2_follow_link,
1334 .put_link = gfs2_put_link,
1335 .permission = gfs2_permission,
1336 .setattr = gfs2_setattr,
1337 .getattr = gfs2_getattr,
1338 .setxattr = gfs2_setxattr,
1339 .getxattr = gfs2_getxattr,
1340 .listxattr = gfs2_listxattr,
1341 .removexattr = gfs2_removexattr,
1342 .fiemap = gfs2_fiemap,
1343};
1344
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index e23d9864c418..42e8d23bc047 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -38,6 +38,7 @@
38 38
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/mm.h>
41#include <linux/spinlock.h> 42#include <linux/spinlock.h>
42#include <linux/completion.h> 43#include <linux/completion.h>
43#include <linux/buffer_head.h> 44#include <linux/buffer_head.h>
@@ -77,19 +78,20 @@ static LIST_HEAD(qd_lru_list);
77static atomic_t qd_lru_count = ATOMIC_INIT(0); 78static atomic_t qd_lru_count = ATOMIC_INIT(0);
78static DEFINE_SPINLOCK(qd_lru_lock); 79static DEFINE_SPINLOCK(qd_lru_lock);
79 80
80int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 81int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc)
81{ 82{
82 struct gfs2_quota_data *qd; 83 struct gfs2_quota_data *qd;
83 struct gfs2_sbd *sdp; 84 struct gfs2_sbd *sdp;
85 int nr_to_scan = sc->nr_to_scan;
84 86
85 if (nr == 0) 87 if (nr_to_scan == 0)
86 goto out; 88 goto out;
87 89
88 if (!(gfp_mask & __GFP_FS)) 90 if (!(sc->gfp_mask & __GFP_FS))
89 return -1; 91 return -1;
90 92
91 spin_lock(&qd_lru_lock); 93 spin_lock(&qd_lru_lock);
92 while (nr && !list_empty(&qd_lru_list)) { 94 while (nr_to_scan && !list_empty(&qd_lru_list)) {
93 qd = list_entry(qd_lru_list.next, 95 qd = list_entry(qd_lru_list.next,
94 struct gfs2_quota_data, qd_reclaim); 96 struct gfs2_quota_data, qd_reclaim);
95 sdp = qd->qd_gl->gl_sbd; 97 sdp = qd->qd_gl->gl_sbd;
@@ -110,7 +112,7 @@ int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
110 spin_unlock(&qd_lru_lock); 112 spin_unlock(&qd_lru_lock);
111 kmem_cache_free(gfs2_quotad_cachep, qd); 113 kmem_cache_free(gfs2_quotad_cachep, qd);
112 spin_lock(&qd_lru_lock); 114 spin_lock(&qd_lru_lock);
113 nr--; 115 nr_to_scan--;
114 } 116 }
115 spin_unlock(&qd_lru_lock); 117 spin_unlock(&qd_lru_lock);
116 118
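/*
 * Illustrative sketch (not kernel code): the interface change this hunk
 * applies, which also appears below in fs/inode.c and fs/mbcache.c.
 * Shrinker callbacks stop taking (nr, gfp_mask) as separate arguments
 * and receive them bundled in struct shrink_control:
 *
 *      old: int shrink(struct shrinker *s, int nr, gfp_t gfp_mask);
 *      new: int shrink(struct shrinker *s, struct shrink_control *sc);
 *
 * A minimal userspace model of the new shape:
 */
#include <stdio.h>

struct shrink_control_model {
        unsigned int gfp_mask;
        int nr_to_scan;
};

static int shrink_cache(struct shrink_control_model *sc)
{
        int freed = 0;

        while (sc->nr_to_scan-- > 0)
                freed++;        /* stand-in for reclaiming one cached object */
        return freed;
}

int main(void)
{
        struct shrink_control_model sc = { .gfp_mask = 0, .nr_to_scan = 3 };

        printf("freed %d objects\n", shrink_cache(&sc));
        return 0;
}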
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index e7d236ca48bd..90bf1c302a98 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -12,6 +12,7 @@
12 12
13struct gfs2_inode; 13struct gfs2_inode;
14struct gfs2_sbd; 14struct gfs2_sbd;
15struct shrink_control;
15 16
16#define NO_QUOTA_CHANGE ((u32)-1) 17#define NO_QUOTA_CHANGE ((u32)-1)
17 18
@@ -51,7 +52,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
51 return ret; 52 return ret;
52} 53}
53 54
54extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask); 55extern int gfs2_shrink_qd_memory(struct shrinker *shrink,
56 struct shrink_control *sc);
55extern const struct quotactl_ops gfs2_quotactl_ops; 57extern const struct quotactl_ops gfs2_quotactl_ops;
56 58
57#endif /* __QUOTA_DOT_H__ */ 59#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 6fcae8469f6d..9b780df3fd54 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -78,10 +78,11 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
78 78
79static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1, 79static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1,
80 unsigned char *buf2, unsigned int offset, 80 unsigned char *buf2, unsigned int offset,
81 unsigned int buflen, u32 block, 81 struct gfs2_bitmap *bi, u32 block,
82 unsigned char new_state) 82 unsigned char new_state)
83{ 83{
84 unsigned char *byte1, *byte2, *end, cur_state; 84 unsigned char *byte1, *byte2, *end, cur_state;
85 unsigned int buflen = bi->bi_len;
85 const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; 86 const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
86 87
87 byte1 = buf1 + offset + (block / GFS2_NBBY); 88 byte1 = buf1 + offset + (block / GFS2_NBBY);
@@ -92,6 +93,16 @@ static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1,
92 cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; 93 cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
93 94
94 if (unlikely(!valid_change[new_state * 4 + cur_state])) { 95 if (unlikely(!valid_change[new_state * 4 + cur_state])) {
96 printk(KERN_WARNING "GFS2: buf_blk = 0x%llx old_state=%d, "
97 "new_state=%d\n",
98 (unsigned long long)block, cur_state, new_state);
99 printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%lx\n",
100 (unsigned long long)rgd->rd_addr,
101 (unsigned long)bi->bi_start);
102 printk(KERN_WARNING "GFS2: bi_offset=0x%lx bi_len=0x%lx\n",
103 (unsigned long)bi->bi_offset,
104 (unsigned long)bi->bi_len);
105 dump_stack();
95 gfs2_consist_rgrpd(rgd); 106 gfs2_consist_rgrpd(rgd);
96 return; 107 return;
97 } 108 }
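/*
 * Illustrative sketch (not GFS2 code): the two-bit-per-block bitmap
 * arithmetic that gfs2_setbit() above is built on. GFS2 packs four
 * block states into each byte (GFS2_NBBY == 4), two bits per block
 * (GFS2_BIT_SIZE == 2), masked with GFS2_BIT_MASK == 0x3.
 */
#include <stdio.h>

#define GFS2_NBBY       4
#define GFS2_BIT_SIZE   2
#define GFS2_BIT_MASK   0x3

static unsigned int get_state(const unsigned char *buf, unsigned int block)
{
        unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;

        return (buf[block / GFS2_NBBY] >> bit) & GFS2_BIT_MASK;
}

static void set_state(unsigned char *buf, unsigned int block,
                      unsigned int state)
{
        unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
        unsigned char *byte = &buf[block / GFS2_NBBY];

        /* Merge the new two-bit field without disturbing its neighbours. */
        *byte ^= (*byte ^ (unsigned char)(state << bit)) &
                 (GFS2_BIT_MASK << bit);
}

int main(void)
{
        unsigned char bitmap[2] = { 0 };

        set_state(bitmap, 5, 1);        /* mark block 5 as state 1 (in use) */
        printf("block 5 state = %u\n", get_state(bitmap, 5));
        return 0;
}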
@@ -381,6 +392,7 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp)
381 392
382 if (gl) { 393 if (gl) {
383 gl->gl_object = NULL; 394 gl->gl_object = NULL;
395 gfs2_glock_add_to_lru(gl);
384 gfs2_glock_put(gl); 396 gfs2_glock_put(gl);
385 } 397 }
386 398
@@ -1365,7 +1377,7 @@ skip:
1365 1377
1366 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); 1378 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1367 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, 1379 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
1368 bi->bi_len, blk, new_state); 1380 bi, blk, new_state);
1369 goal = blk; 1381 goal = blk;
1370 while (*n < elen) { 1382 while (*n < elen) {
1371 goal++; 1383 goal++;
@@ -1375,7 +1387,7 @@ skip:
1375 GFS2_BLKST_FREE) 1387 GFS2_BLKST_FREE)
1376 break; 1388 break;
1377 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, 1389 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
1378 bi->bi_len, goal, new_state); 1390 bi, goal, new_state);
1379 (*n)++; 1391 (*n)++;
1380 } 1392 }
1381out: 1393out:
@@ -1432,7 +1444,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
1432 } 1444 }
1433 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); 1445 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1434 gfs2_setbit(rgd, bi->bi_bh->b_data, NULL, bi->bi_offset, 1446 gfs2_setbit(rgd, bi->bi_bh->b_data, NULL, bi->bi_offset,
1435 bi->bi_len, buf_blk, new_state); 1447 bi, buf_blk, new_state);
1436 } 1448 }
1437 1449
1438 return rgd; 1450 return rgd;
@@ -1617,6 +1629,10 @@ void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1617 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1629 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1618 1630
1619 gfs2_trans_add_rg(rgd); 1631 gfs2_trans_add_rg(rgd);
1632
1633 /* Directories keep their data in the metadata address space */
1634 if (ip->i_depth)
1635 gfs2_meta_wipe(ip, bstart, blen);
1620} 1636}
1621 1637
1622/** 1638/**
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b9f28e66dad1..ed540e7018be 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -23,6 +23,7 @@
23#include <linux/time.h> 23#include <linux/time.h>
24#include <linux/wait.h> 24#include <linux/wait.h>
25#include <linux/writeback.h> 25#include <linux/writeback.h>
26#include <linux/backing-dev.h>
26 27
27#include "gfs2.h" 28#include "gfs2.h"
28#include "incore.h" 29#include "incore.h"
@@ -700,11 +701,47 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
700 mutex_unlock(&sdp->sd_freeze_lock); 701 mutex_unlock(&sdp->sd_freeze_lock);
701} 702}
702 703
704void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
705{
706 struct gfs2_dinode *str = buf;
707
708 str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
709 str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
710 str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
711 str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
712 str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
713 str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
714 str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
715 str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
716 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
717 str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
718 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
719 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
720 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
721 str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec);
722
723 str->di_goal_meta = cpu_to_be64(ip->i_goal);
724 str->di_goal_data = cpu_to_be64(ip->i_goal);
725 str->di_generation = cpu_to_be64(ip->i_generation);
726
727 str->di_flags = cpu_to_be32(ip->i_diskflags);
728 str->di_height = cpu_to_be16(ip->i_height);
729 str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
730 !(ip->i_diskflags & GFS2_DIF_EXHASH) ?
731 GFS2_FORMAT_DE : 0);
732 str->di_depth = cpu_to_be16(ip->i_depth);
733 str->di_entries = cpu_to_be32(ip->i_entries);
734
735 str->di_eattr = cpu_to_be64(ip->i_eattr);
736 str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
737 str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
738 str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
739}
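/*
 * Illustrative sketch (not kernel code): gfs2_dinode_out() above is a
 * field-by-field host-to-big-endian serialisation of the in-core inode.
 * The byte swap below matches cpu_to_be32() on a little-endian host; on
 * a big-endian host cpu_to_be32() is the identity.
 */
#include <inttypes.h>
#include <stdio.h>

static uint32_t swap32(uint32_t v)
{
        return ((v & 0x000000ffu) << 24) | ((v & 0x0000ff00u) << 8) |
               ((v & 0x00ff0000u) >> 8)  | ((v & 0xff000000u) >> 24);
}

int main(void)
{
        uint32_t mode = 0100644;                /* S_IFREG | 0644 */
        uint32_t di_mode = swap32(mode);        /* on-disk big-endian form */

        printf("host 0x%08" PRIx32 " -> disk 0x%08" PRIx32 "\n",
               mode, di_mode);
        return 0;
}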
703 740
704/** 741/**
705 * gfs2_write_inode - Make sure the inode is stable on the disk 742 * gfs2_write_inode - Make sure the inode is stable on the disk
706 * @inode: The inode 743 * @inode: The inode
707 * @sync: synchronous write flag 744 * @wbc: The writeback control structure
708 * 745 *
709 * Returns: errno 746 * Returns: errno
710 */ 747 */
@@ -713,15 +750,17 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
713{ 750{
714 struct gfs2_inode *ip = GFS2_I(inode); 751 struct gfs2_inode *ip = GFS2_I(inode);
715 struct gfs2_sbd *sdp = GFS2_SB(inode); 752 struct gfs2_sbd *sdp = GFS2_SB(inode);
753 struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
754 struct backing_dev_info *bdi = metamapping->backing_dev_info;
716 struct gfs2_holder gh; 755 struct gfs2_holder gh;
717 struct buffer_head *bh; 756 struct buffer_head *bh;
718 struct timespec atime; 757 struct timespec atime;
719 struct gfs2_dinode *di; 758 struct gfs2_dinode *di;
720 int ret = 0; 759 int ret = -EAGAIN;
721 760
722 /* Check this is a "normal" inode, etc */ 761 /* Skip timestamp update, if this is from a memalloc */
723 if (current->flags & PF_MEMALLOC) 762 if (current->flags & PF_MEMALLOC)
724 return 0; 763 goto do_flush;
725 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 764 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
726 if (ret) 765 if (ret)
727 goto do_flush; 766 goto do_flush;
@@ -745,6 +784,13 @@ do_unlock:
745do_flush: 784do_flush:
746 if (wbc->sync_mode == WB_SYNC_ALL) 785 if (wbc->sync_mode == WB_SYNC_ALL)
747 gfs2_log_flush(GFS2_SB(inode), ip->i_gl); 786 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
787 filemap_fdatawrite(metamapping);
788 if (bdi->dirty_exceeded)
789 gfs2_ail1_flush(sdp, wbc);
790 if (!ret && (wbc->sync_mode == WB_SYNC_ALL))
791 ret = filemap_fdatawait(metamapping);
792 if (ret)
793 mark_inode_dirty_sync(inode);
748 return ret; 794 return ret;
749} 795}
750 796
@@ -874,8 +920,9 @@ restart:
874 920
875static int gfs2_sync_fs(struct super_block *sb, int wait) 921static int gfs2_sync_fs(struct super_block *sb, int wait)
876{ 922{
877 if (wait && sb->s_fs_info) 923 struct gfs2_sbd *sdp = sb->s_fs_info;
878 gfs2_log_flush(sb->s_fs_info, NULL); 924 if (wait && sdp)
925 gfs2_log_flush(sdp, NULL);
879 return 0; 926 return 0;
880} 927}
881 928
@@ -1308,6 +1355,78 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1308 return 0; 1355 return 0;
1309} 1356}
1310 1357
1358static void gfs2_final_release_pages(struct gfs2_inode *ip)
1359{
1360 struct inode *inode = &ip->i_inode;
1361 struct gfs2_glock *gl = ip->i_gl;
1362
1363 truncate_inode_pages(gfs2_glock2aspace(ip->i_gl), 0);
1364 truncate_inode_pages(&inode->i_data, 0);
1365
1366 if (atomic_read(&gl->gl_revokes) == 0) {
1367 clear_bit(GLF_LFLUSH, &gl->gl_flags);
1368 clear_bit(GLF_DIRTY, &gl->gl_flags);
1369 }
1370}
1371
1372static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
1373{
1374 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1375 struct gfs2_alloc *al;
1376 struct gfs2_rgrpd *rgd;
1377 int error;
1378
1379 if (gfs2_get_inode_blocks(&ip->i_inode) != 1) {
1380 gfs2_consist_inode(ip);
1381 return -EIO;
1382 }
1383
1384 al = gfs2_alloc_get(ip);
1385 if (!al)
1386 return -ENOMEM;
1387
1388 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1389 if (error)
1390 goto out;
1391
1392 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
1393 if (error)
1394 goto out_qs;
1395
1396 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
1397 if (!rgd) {
1398 gfs2_consist_inode(ip);
1399 error = -EIO;
1400 goto out_rindex_relse;
1401 }
1402
1403 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
1404 &al->al_rgd_gh);
1405 if (error)
1406 goto out_rindex_relse;
1407
1408 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA,
1409 sdp->sd_jdesc->jd_blocks);
1410 if (error)
1411 goto out_rg_gunlock;
1412
1413 gfs2_free_di(rgd, ip);
1414
1415 gfs2_final_release_pages(ip);
1416
1417 gfs2_trans_end(sdp);
1418
1419out_rg_gunlock:
1420 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1421out_rindex_relse:
1422 gfs2_glock_dq_uninit(&al->al_ri_gh);
1423out_qs:
1424 gfs2_quota_unhold(ip);
1425out:
1426 gfs2_alloc_put(ip);
1427 return error;
1428}
1429
1311/* 1430/*
1312 * We have to (at the moment) hold the inodes main lock to cover 1431 * We have to (at the moment) hold the inodes main lock to cover
1313 * the gap between unlocking the shared lock on the iopen lock and 1432 * the gap between unlocking the shared lock on the iopen lock and
@@ -1371,15 +1490,13 @@ static void gfs2_evict_inode(struct inode *inode)
1371 } 1490 }
1372 1491
1373 error = gfs2_dinode_dealloc(ip); 1492 error = gfs2_dinode_dealloc(ip);
1374 if (error) 1493 goto out_unlock;
1375 goto out_unlock;
1376 1494
1377out_truncate: 1495out_truncate:
1378 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); 1496 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1379 if (error) 1497 if (error)
1380 goto out_unlock; 1498 goto out_unlock;
1381 /* Needs to be done before glock release & also in a transaction */ 1499 gfs2_final_release_pages(ip);
1382 truncate_inode_pages(&inode->i_data, 0);
1383 gfs2_trans_end(sdp); 1500 gfs2_trans_end(sdp);
1384 1501
1385out_unlock: 1502out_unlock:
@@ -1394,6 +1511,7 @@ out:
1394 end_writeback(inode); 1511 end_writeback(inode);
1395 1512
1396 ip->i_gl->gl_object = NULL; 1513 ip->i_gl->gl_object = NULL;
1514 gfs2_glock_add_to_lru(ip->i_gl);
1397 gfs2_glock_put(ip->i_gl); 1515 gfs2_glock_put(ip->i_gl);
1398 ip->i_gl = NULL; 1516 ip->i_gl = NULL;
1399 if (ip->i_iopen_gh.gh_gl) { 1517 if (ip->i_iopen_gh.gh_gl) {
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 748ccb557c18..e20eab37bc80 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -81,7 +81,8 @@ static int gfs2_uuid_valid(const u8 *uuid)
81 81
82static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf) 82static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf)
83{ 83{
84 const u8 *uuid = sdp->sd_sb.sb_uuid; 84 struct super_block *s = sdp->sd_vfs;
85 const u8 *uuid = s->s_uuid;
85 buf[0] = '\0'; 86 buf[0] = '\0';
86 if (!gfs2_uuid_valid(uuid)) 87 if (!gfs2_uuid_valid(uuid))
87 return 0; 88 return 0;
@@ -616,7 +617,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
616 struct kobj_uevent_env *env) 617 struct kobj_uevent_env *env)
617{ 618{
618 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); 619 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
619 const u8 *uuid = sdp->sd_sb.sb_uuid; 620 struct super_block *s = sdp->sd_vfs;
621 const u8 *uuid = s->s_uuid;
620 622
621 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); 623 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
622 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); 624 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index cedb0bb96d96..5d07609ec57d 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -10,6 +10,7 @@
10#include <linux/buffer_head.h> 10#include <linux/buffer_head.h>
11#include <linux/dlmconstants.h> 11#include <linux/dlmconstants.h>
12#include <linux/gfs2_ondisk.h> 12#include <linux/gfs2_ondisk.h>
13#include <linux/writeback.h>
13#include "incore.h" 14#include "incore.h"
14#include "glock.h" 15#include "glock.h"
15 16
@@ -40,7 +41,9 @@
40 {(1UL << GLF_REPLY_PENDING), "r" }, \ 41 {(1UL << GLF_REPLY_PENDING), "r" }, \
41 {(1UL << GLF_INITIAL), "I" }, \ 42 {(1UL << GLF_INITIAL), "I" }, \
42 {(1UL << GLF_FROZEN), "F" }, \ 43 {(1UL << GLF_FROZEN), "F" }, \
43 {(1UL << GLF_QUEUED), "q" }) 44 {(1UL << GLF_QUEUED), "q" }, \
45 {(1UL << GLF_LRU), "L" }, \
46 {(1UL << GLF_OBJECT), "o" })
44 47
45#ifndef NUMPTY 48#ifndef NUMPTY
46#define NUMPTY 49#define NUMPTY
@@ -94,7 +97,7 @@ TRACE_EVENT(gfs2_glock_state_change,
94 __entry->new_state = glock_trace_state(new_state); 97 __entry->new_state = glock_trace_state(new_state);
95 __entry->tgt_state = glock_trace_state(gl->gl_target); 98 __entry->tgt_state = glock_trace_state(gl->gl_target);
96 __entry->dmt_state = glock_trace_state(gl->gl_demote_state); 99 __entry->dmt_state = glock_trace_state(gl->gl_demote_state);
97 __entry->flags = gl->gl_flags; 100 __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0);
98 ), 101 ),
99 102
100 TP_printk("%u,%u glock %d:%lld state %s to %s tgt:%s dmt:%s flags:%s", 103 TP_printk("%u,%u glock %d:%lld state %s to %s tgt:%s dmt:%s flags:%s",
@@ -127,7 +130,7 @@ TRACE_EVENT(gfs2_glock_put,
127 __entry->gltype = gl->gl_name.ln_type; 130 __entry->gltype = gl->gl_name.ln_type;
128 __entry->glnum = gl->gl_name.ln_number; 131 __entry->glnum = gl->gl_name.ln_number;
129 __entry->cur_state = glock_trace_state(gl->gl_state); 132 __entry->cur_state = glock_trace_state(gl->gl_state);
130 __entry->flags = gl->gl_flags; 133 __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0);
131 ), 134 ),
132 135
133 TP_printk("%u,%u glock %d:%lld state %s => %s flags:%s", 136 TP_printk("%u,%u glock %d:%lld state %s => %s flags:%s",
@@ -161,7 +164,7 @@ TRACE_EVENT(gfs2_demote_rq,
161 __entry->glnum = gl->gl_name.ln_number; 164 __entry->glnum = gl->gl_name.ln_number;
162 __entry->cur_state = glock_trace_state(gl->gl_state); 165 __entry->cur_state = glock_trace_state(gl->gl_state);
163 __entry->dmt_state = glock_trace_state(gl->gl_demote_state); 166 __entry->dmt_state = glock_trace_state(gl->gl_demote_state);
164 __entry->flags = gl->gl_flags; 167 __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0);
165 ), 168 ),
166 169
167 TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s", 170 TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s",
@@ -318,6 +321,33 @@ TRACE_EVENT(gfs2_log_blocks,
318 MINOR(__entry->dev), __entry->blocks) 321 MINOR(__entry->dev), __entry->blocks)
319); 322);
320 323
324/* Writing back the AIL */
325TRACE_EVENT(gfs2_ail_flush,
326
327 TP_PROTO(const struct gfs2_sbd *sdp, const struct writeback_control *wbc, int start),
328
329 TP_ARGS(sdp, wbc, start),
330
331 TP_STRUCT__entry(
332 __field( dev_t, dev )
333 __field( int, start )
334 __field( int, sync_mode )
335 __field( long, nr_to_write )
336 ),
337
338 TP_fast_assign(
339 __entry->dev = sdp->sd_vfs->s_dev;
340 __entry->start = start;
341 __entry->sync_mode = wbc->sync_mode;
342 __entry->nr_to_write = wbc->nr_to_write;
343 ),
344
345 TP_printk("%u,%u ail flush %s %s %ld", MAJOR(__entry->dev),
346 MINOR(__entry->dev), __entry->start ? "start" : "end",
347 __entry->sync_mode == WB_SYNC_ALL ? "all" : "none",
348 __entry->nr_to_write)
349);
350
321/* Section 3 - bmap 351/* Section 3 - bmap
322 * 352 *
323 * Objectives: 353 * Objectives:
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index b4d70b13be92..1cb70cdba2c1 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -253,6 +253,9 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
253 struct inode *inode = dentry->d_inode; 253 struct inode *inode = dentry->d_inode;
254 int res; 254 int res;
255 255
256 if (S_ISDIR(inode->i_mode))
257 dentry_unhash(dentry);
258
256 if (S_ISDIR(inode->i_mode) && inode->i_size != 2) 259 if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
257 return -ENOTEMPTY; 260 return -ENOTEMPTY;
258 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); 261 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
@@ -283,6 +286,9 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
283 286
284 /* Unlink destination if it already exists */ 287 /* Unlink destination if it already exists */
285 if (new_dentry->d_inode) { 288 if (new_dentry->d_inode) {
289 if (S_ISDIR(new_dentry->d_inode->i_mode))
290 dentry_unhash(new_dentry);
291
286 res = hfs_remove(new_dir, new_dentry); 292 res = hfs_remove(new_dir, new_dentry);
287 if (res) 293 if (res)
288 return res; 294 return res;
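/*
 * Illustrative sketch (not filesystem code): the dentry_unhash() hunks
 * in this and the following directory-operation patches all follow one
 * pattern. The VFS no longer unhashes the victim dentry before
 * rmdir/rename, so filesystems that relied on that now do it
 * themselves: unconditionally for rmdir (the victim is always a
 * directory), and only for directory victims in rename. A userspace
 * model of the two call sites:
 */
#include <stdio.h>

struct dentry_model { int hashed; int is_dir; };

static void dentry_unhash_model(struct dentry_model *d)
{
        d->hashed = 0;  /* drop from the lookup hash */
}

static int rmdir_model(struct dentry_model *victim)
{
        dentry_unhash_model(victim);
        /* ... filesystem-specific emptiness check and removal ... */
        return 0;
}

static int rename_model(struct dentry_model *target)
{
        if (target && target->is_dir)
                dentry_unhash_model(target);
        /* ... filesystem-specific rename work ... */
        return 0;
}

int main(void)
{
        struct dentry_model dir = { 1, 1 }, file = { 1, 0 };

        rename_model(&file);    /* non-directory target stays hashed */
        rmdir_model(&dir);      /* directory victim is unhashed */
        printf("file.hashed=%d dir.hashed=%d\n", file.hashed, dir.hashed);
        return 0;
}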
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 4df5059c25da..b28835091dd0 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -370,6 +370,8 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
370 struct inode *inode = dentry->d_inode; 370 struct inode *inode = dentry->d_inode;
371 int res; 371 int res;
372 372
373 dentry_unhash(dentry);
374
373 if (inode->i_size != 2) 375 if (inode->i_size != 2)
374 return -ENOTEMPTY; 376 return -ENOTEMPTY;
375 377
@@ -467,10 +469,12 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
467 469
468 /* Unlink destination if it already exists */ 470 /* Unlink destination if it already exists */
469 if (new_dentry->d_inode) { 471 if (new_dentry->d_inode) {
470 if (S_ISDIR(new_dentry->d_inode->i_mode)) 472 if (S_ISDIR(new_dentry->d_inode->i_mode)) {
473 dentry_unhash(new_dentry);
471 res = hfsplus_rmdir(new_dir, new_dentry); 474 res = hfsplus_rmdir(new_dir, new_dentry);
472 else 475 } else {
473 res = hfsplus_unlink(new_dir, new_dentry); 476 res = hfsplus_unlink(new_dir, new_dentry);
477 }
474 if (res) 478 if (res)
475 return res; 479 return res;
476 } 480 }
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2638c834ed28..e6816b9e6903 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -683,6 +683,8 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
683 char *file; 683 char *file;
684 int err; 684 int err;
685 685
686 dentry_unhash(dentry);
687
686 if ((file = dentry_name(dentry)) == NULL) 688 if ((file = dentry_name(dentry)) == NULL)
687 return -ENOMEM; 689 return -ENOMEM;
688 err = do_rmdir(file); 690 err = do_rmdir(file);
@@ -736,6 +738,9 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
736 char *from_name, *to_name; 738 char *from_name, *to_name;
737 int err; 739 int err;
738 740
741 if (to->d_inode && S_ISDIR(to->d_inode->i_mode))
742 dentry_unhash(to);
743
739 if ((from_name = dentry_name(from)) == NULL) 744 if ((from_name = dentry_name(from)) == NULL)
740 return -ENOMEM; 745 return -ENOMEM;
741 if ((to_name = dentry_name(to)) == NULL) { 746 if ((to_name = dentry_name(to)) == NULL) {
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 1f05839c27a7..ff0ce21c0867 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -395,7 +395,6 @@ again:
395 395
396 dentry_unhash(dentry); 396 dentry_unhash(dentry);
397 if (!d_unhashed(dentry)) { 397 if (!d_unhashed(dentry)) {
398 dput(dentry);
399 hpfs_unlock(dir->i_sb); 398 hpfs_unlock(dir->i_sb);
400 return -ENOSPC; 399 return -ENOSPC;
401 } 400 }
@@ -403,7 +402,6 @@ again:
403 !S_ISREG(inode->i_mode) || 402 !S_ISREG(inode->i_mode) ||
404 get_write_access(inode)) { 403 get_write_access(inode)) {
405 d_rehash(dentry); 404 d_rehash(dentry);
406 dput(dentry);
407 } else { 405 } else {
408 struct iattr newattrs; 406 struct iattr newattrs;
409 /*printk("HPFS: truncating file before delete.\n");*/ 407 /*printk("HPFS: truncating file before delete.\n");*/
@@ -411,7 +409,6 @@ again:
411 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; 409 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
412 err = notify_change(dentry, &newattrs); 410 err = notify_change(dentry, &newattrs);
413 put_write_access(inode); 411 put_write_access(inode);
414 dput(dentry);
415 if (!err) 412 if (!err)
416 goto again; 413 goto again;
417 } 414 }
@@ -442,6 +439,8 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
442 int err; 439 int err;
443 int r; 440 int r;
444 441
442 dentry_unhash(dentry);
443
445 hpfs_adjust_length(name, &len); 444 hpfs_adjust_length(name, &len);
446 hpfs_lock(dir->i_sb); 445 hpfs_lock(dir->i_sb);
447 err = -ENOENT; 446 err = -ENOENT;
@@ -535,6 +534,10 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
535 struct buffer_head *bh; 534 struct buffer_head *bh;
536 struct fnode *fnode; 535 struct fnode *fnode;
537 int err; 536 int err;
537
538 if (new_inode && S_ISDIR(new_inode->i_mode))
539 dentry_unhash(new_dentry);
540
538 if ((err = hpfs_chk_name(new_name, &new_len))) return err; 541 if ((err = hpfs_chk_name(new_name, &new_len))) return err;
539 err = 0; 542 err = 0;
540 hpfs_adjust_length(old_name, &old_len); 543 hpfs_adjust_length(old_name, &old_len);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b9eeb1cd03ff..7aafeb8fa300 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
412 pgoff = offset >> PAGE_SHIFT; 412 pgoff = offset >> PAGE_SHIFT;
413 413
414 i_size_write(inode, offset); 414 i_size_write(inode, offset);
415 spin_lock(&mapping->i_mmap_lock); 415 mutex_lock(&mapping->i_mmap_mutex);
416 if (!prio_tree_empty(&mapping->i_mmap)) 416 if (!prio_tree_empty(&mapping->i_mmap))
417 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); 417 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
418 spin_unlock(&mapping->i_mmap_lock); 418 mutex_unlock(&mapping->i_mmap_mutex);
419 truncate_hugepages(inode, offset); 419 truncate_hugepages(inode, offset);
420 return 0; 420 return 0;
421} 421}
@@ -921,7 +921,8 @@ static int can_do_hugetlb_shm(void)
921 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); 921 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
922} 922}
923 923
924struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, 924struct file *hugetlb_file_setup(const char *name, size_t size,
925 vm_flags_t acctflag,
925 struct user_struct **user, int creat_flags) 926 struct user_struct **user, int creat_flags)
926{ 927{
927 int error = -ENOMEM; 928 int error = -ENOMEM;
diff --git a/fs/inode.c b/fs/inode.c
index 33c963d08ab4..990d284877a1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -24,6 +24,7 @@
24#include <linux/mount.h> 24#include <linux/mount.h>
25#include <linux/async.h> 25#include <linux/async.h>
26#include <linux/posix_acl.h> 26#include <linux/posix_acl.h>
27#include <linux/prefetch.h>
27#include <linux/ima.h> 28#include <linux/ima.h>
28#include <linux/cred.h> 29#include <linux/cred.h>
29#include "internal.h" 30#include "internal.h"
@@ -325,12 +326,11 @@ void address_space_init_once(struct address_space *mapping)
325 memset(mapping, 0, sizeof(*mapping)); 326 memset(mapping, 0, sizeof(*mapping));
326 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); 327 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
327 spin_lock_init(&mapping->tree_lock); 328 spin_lock_init(&mapping->tree_lock);
328 spin_lock_init(&mapping->i_mmap_lock); 329 mutex_init(&mapping->i_mmap_mutex);
329 INIT_LIST_HEAD(&mapping->private_list); 330 INIT_LIST_HEAD(&mapping->private_list);
330 spin_lock_init(&mapping->private_lock); 331 spin_lock_init(&mapping->private_lock);
331 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); 332 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
332 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); 333 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
333 mutex_init(&mapping->unmap_mutex);
334} 334}
335EXPORT_SYMBOL(address_space_init_once); 335EXPORT_SYMBOL(address_space_init_once);
336 336
@@ -751,8 +751,12 @@ static void prune_icache(int nr_to_scan)
751 * This function is passed the number of inodes to scan, and it returns the 751 * This function is passed the number of inodes to scan, and it returns the
752 * total number of remaining possibly-reclaimable inodes. 752 * total number of remaining possibly-reclaimable inodes.
753 */ 753 */
754static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 754static int shrink_icache_memory(struct shrinker *shrink,
755 struct shrink_control *sc)
755{ 756{
757 int nr = sc->nr_to_scan;
758 gfp_t gfp_mask = sc->gfp_mask;
759
756 if (nr) { 760 if (nr) {
757 /* 761 /*
758 * Nasty deadlock avoidance. We may hold various FS locks, 762 * Nasty deadlock avoidance. We may hold various FS locks,
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 69b180459463..72ffa974b0b8 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -302,12 +302,6 @@ void journal_commit_transaction(journal_t *journal)
302 * all outstanding updates to complete. 302 * all outstanding updates to complete.
303 */ 303 */
304 304
305#ifdef COMMIT_STATS
306 spin_lock(&journal->j_list_lock);
307 summarise_journal_usage(journal);
308 spin_unlock(&journal->j_list_lock);
309#endif
310
311 /* Do we need to erase the effects of a prior journal_flush? */ 305 /* Do we need to erase the effects of a prior journal_flush? */
312 if (journal->j_flags & JFS_FLUSHED) { 306 if (journal->j_flags & JFS_FLUSHED) {
313 jbd_debug(3, "super block updated\n"); 307 jbd_debug(3, "super block updated\n");
@@ -722,8 +716,13 @@ wait_for_iobuf:
722 required. */ 716 required. */
723 JBUFFER_TRACE(jh, "file as BJ_Forget"); 717 JBUFFER_TRACE(jh, "file as BJ_Forget");
724 journal_file_buffer(jh, commit_transaction, BJ_Forget); 718 journal_file_buffer(jh, commit_transaction, BJ_Forget);
725 /* Wake up any transactions which were waiting for this 719 /*
726 IO to complete */ 720 * Wake up any transactions which were waiting for this
721 * IO to complete. The barrier must be here so that changes
722 * by journal_file_buffer() take effect before wake_up_bit()
723 * does the waitqueue check.
724 */
725 smp_mb();
727 wake_up_bit(&bh->b_state, BH_Unshadow); 726 wake_up_bit(&bh->b_state, BH_Unshadow);
728 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 727 JBUFFER_TRACE(jh, "brelse shadowed buffer");
729 __brelse(bh); 728 __brelse(bh);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index b3713afaaa9e..e2d4285fbe90 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -437,9 +437,12 @@ int __log_space_left(journal_t *journal)
437int __log_start_commit(journal_t *journal, tid_t target) 437int __log_start_commit(journal_t *journal, tid_t target)
438{ 438{
439 /* 439 /*
440 * Are we already doing a recent enough commit? 440 * The only transaction we can possibly wait upon is the
441 * currently running transaction (if it exists). Otherwise,
442 * the target tid must be an old one.
441 */ 443 */
442 if (!tid_geq(journal->j_commit_request, target)) { 444 if (journal->j_running_transaction &&
445 journal->j_running_transaction->t_tid == target) {
443 /* 446 /*
444 * We want a new commit: OK, mark the request and wakeup the 447 * We want a new commit: OK, mark the request and wakeup the
445 * commit thread. We do _not_ do the commit ourselves. 448 * commit thread. We do _not_ do the commit ourselves.
@@ -451,7 +454,14 @@ int __log_start_commit(journal_t *journal, tid_t target)
451 journal->j_commit_sequence); 454 journal->j_commit_sequence);
452 wake_up(&journal->j_wait_commit); 455 wake_up(&journal->j_wait_commit);
453 return 1; 456 return 1;
454 } 457 } else if (!tid_geq(journal->j_commit_request, target))
458 /* This should never happen, but if it does, preserve
459 the evidence before kjournald goes into a loop and
460 increments j_commit_sequence beyond all recognition. */
461 WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
462 journal->j_commit_request, journal->j_commit_sequence,
463 target, journal->j_running_transaction ?
464 journal->j_running_transaction->t_tid : 0);
455 return 0; 465 return 0;
456} 466}
457 467
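/*
 * Illustrative sketch (not kernel code): tid_geq(), used by the
 * __log_start_commit() hunk above and its jbd2 counterpart below, is a
 * wrap-safe sequence comparison. Transaction ids are free-running
 * 32-bit counters, so "x >= y" is evaluated on the signed difference
 * rather than on the raw values.
 */
#include <stdio.h>

typedef unsigned int tid_t;

static int tid_geq(tid_t x, tid_t y)
{
        int difference = (int)(x - y);

        return difference >= 0;
}

int main(void)
{
        printf("%d\n", tid_geq(5, 3));                   /* 1: plainly newer */
        printf("%d\n", tid_geq(0x00000002, 0xfffffffe)); /* 1: newer across wrap */
        printf("%d\n", tid_geq(3, 5));                   /* 0: older */
        return 0;
}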
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d2319651b2..f7ee81a065da 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -266,7 +266,8 @@ static handle_t *new_handle(int nblocks)
266 * This function is visible to journal users (like ext3fs), so is not 266 * This function is visible to journal users (like ext3fs), so is not
267 * called with the journal already locked. 267 * called with the journal already locked.
268 * 268 *
269 * Return a pointer to a newly allocated handle, or NULL on failure 269 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
270 * on failure.
270 */ 271 */
271handle_t *journal_start(journal_t *journal, int nblocks) 272handle_t *journal_start(journal_t *journal, int nblocks)
272{ 273{
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6e28000a4b21..7f21cf3aaf92 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -219,7 +219,6 @@ static int journal_submit_data_buffers(journal_t *journal,
219 ret = err; 219 ret = err;
220 spin_lock(&journal->j_list_lock); 220 spin_lock(&journal->j_list_lock);
221 J_ASSERT(jinode->i_transaction == commit_transaction); 221 J_ASSERT(jinode->i_transaction == commit_transaction);
222 commit_transaction->t_flushed_data_blocks = 1;
223 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 222 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
224 smp_mb__after_clear_bit(); 223 smp_mb__after_clear_bit();
225 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 224 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
@@ -338,12 +337,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
338 * all outstanding updates to complete. 337 * all outstanding updates to complete.
339 */ 338 */
340 339
341#ifdef COMMIT_STATS
342 spin_lock(&journal->j_list_lock);
343 summarise_journal_usage(journal);
344 spin_unlock(&journal->j_list_lock);
345#endif
346
347 /* Do we need to erase the effects of a prior jbd2_journal_flush? */ 340 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
348 if (journal->j_flags & JBD2_FLUSHED) { 341 if (journal->j_flags & JBD2_FLUSHED) {
349 jbd_debug(3, "super block updated\n"); 342 jbd_debug(3, "super block updated\n");
@@ -678,12 +671,16 @@ start_journal_io:
678 err = 0; 671 err = 0;
679 } 672 }
680 673
674 write_lock(&journal->j_state_lock);
675 J_ASSERT(commit_transaction->t_state == T_COMMIT);
676 commit_transaction->t_state = T_COMMIT_DFLUSH;
677 write_unlock(&journal->j_state_lock);
681 /* 678 /*
682 * If the journal is not located on the file system device, 679 * If the journal is not located on the file system device,
683 * then we must flush the file system device before we issue 680 * then we must flush the file system device before we issue
684 * the commit record 681 * the commit record
685 */ 682 */
686 if (commit_transaction->t_flushed_data_blocks && 683 if (commit_transaction->t_need_data_flush &&
687 (journal->j_fs_dev != journal->j_dev) && 684 (journal->j_fs_dev != journal->j_dev) &&
688 (journal->j_flags & JBD2_BARRIER)) 685 (journal->j_flags & JBD2_BARRIER))
689 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); 686 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
@@ -760,8 +757,13 @@ wait_for_iobuf:
760 required. */ 757 required. */
761 JBUFFER_TRACE(jh, "file as BJ_Forget"); 758 JBUFFER_TRACE(jh, "file as BJ_Forget");
762 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); 759 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
763 /* Wake up any transactions which were waiting for this 760 /*
764 IO to complete */ 761 * Wake up any transactions which were waiting for this IO to
762 * complete. The barrier must be here so that changes by
763 * jbd2_journal_file_buffer() take effect before wake_up_bit()
764 * does the waitqueue check.
765 */
766 smp_mb();
765 wake_up_bit(&bh->b_state, BH_Unshadow); 767 wake_up_bit(&bh->b_state, BH_Unshadow);
766 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 768 JBUFFER_TRACE(jh, "brelse shadowed buffer");
767 __brelse(bh); 769 __brelse(bh);
@@ -800,6 +802,10 @@ wait_for_iobuf:
800 jbd2_journal_abort(journal, err); 802 jbd2_journal_abort(journal, err);
801 803
802 jbd_debug(3, "JBD: commit phase 5\n"); 804 jbd_debug(3, "JBD: commit phase 5\n");
805 write_lock(&journal->j_state_lock);
806 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
807 commit_transaction->t_state = T_COMMIT_JFLUSH;
808 write_unlock(&journal->j_state_lock);
803 809
804 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 810 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
805 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 811 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -955,7 +961,7 @@ restart_loop:
955 961
956 jbd_debug(3, "JBD: commit phase 7\n"); 962 jbd_debug(3, "JBD: commit phase 7\n");
957 963
958 J_ASSERT(commit_transaction->t_state == T_COMMIT); 964 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
959 965
960 commit_transaction->t_start = jiffies; 966 commit_transaction->t_start = jiffies;
961 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, 967 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e0ec3db1c395..9a7826990304 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -479,9 +479,12 @@ int __jbd2_log_space_left(journal_t *journal)
479int __jbd2_log_start_commit(journal_t *journal, tid_t target) 479int __jbd2_log_start_commit(journal_t *journal, tid_t target)
480{ 480{
481 /* 481 /*
482 * Are we already doing a recent enough commit? 482 * The only transaction we can possibly wait upon is the
483 * currently running transaction (if it exists). Otherwise,
484 * the target tid must be an old one.
483 */ 485 */
484 if (!tid_geq(journal->j_commit_request, target)) { 486 if (journal->j_running_transaction &&
487 journal->j_running_transaction->t_tid == target) {
485 /* 488 /*
486 * We want a new commit: OK, mark the request and wakeup the 489 * We want a new commit: OK, mark the request and wakeup the
487 * commit thread. We do _not_ do the commit ourselves. 490 * commit thread. We do _not_ do the commit ourselves.
@@ -493,7 +496,15 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
493 journal->j_commit_sequence); 496 journal->j_commit_sequence);
494 wake_up(&journal->j_wait_commit); 497 wake_up(&journal->j_wait_commit);
495 return 1; 498 return 1;
496 } 499 } else if (!tid_geq(journal->j_commit_request, target))
500 /* This should never happen, but if it does, preserve
501 the evidence before kjournald goes into a loop and
502 increments j_commit_sequence beyond all recognition. */
503 WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
504 journal->j_commit_request,
505 journal->j_commit_sequence,
506 target, journal->j_running_transaction ?
507 journal->j_running_transaction->t_tid : 0);
497 return 0; 508 return 0;
498} 509}
499 510
@@ -577,6 +588,47 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
577} 588}
578 589
579/* 590/*
591 * Return 1 if a given transaction has not yet sent barrier request
592 * connected with a transaction commit. If 0 is returned, transaction
593 * may or may not have sent the barrier. Used to avoid sending barrier
594 * twice in common cases.
595 */
596int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
597{
598 int ret = 0;
599 transaction_t *commit_trans;
600
601 if (!(journal->j_flags & JBD2_BARRIER))
602 return 0;
603 read_lock(&journal->j_state_lock);
604 /* Transaction already committed? */
605 if (tid_geq(journal->j_commit_sequence, tid))
606 goto out;
607 commit_trans = journal->j_committing_transaction;
608 if (!commit_trans || commit_trans->t_tid != tid) {
609 ret = 1;
610 goto out;
611 }
612 /*
613 * Transaction is being committed and we already proceeded to
614 * submitting a flush to fs partition?
615 */
616 if (journal->j_fs_dev != journal->j_dev) {
617 if (!commit_trans->t_need_data_flush ||
618 commit_trans->t_state >= T_COMMIT_DFLUSH)
619 goto out;
620 } else {
621 if (commit_trans->t_state >= T_COMMIT_JFLUSH)
622 goto out;
623 }
624 ret = 1;
625out:
626 read_unlock(&journal->j_state_lock);
627 return ret;
628}
629EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
630
631/*
580 * Wait for a specified commit to complete. 632 * Wait for a specified commit to complete.
581 * The caller may not hold the journal lock. 633 * The caller may not hold the journal lock.
582 */ 634 */
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 05fa77a23711..3eec82d32fd4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -82,7 +82,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
82 */ 82 */
83 83
84/* 84/*
85 * Update transiaction's maximum wait time, if debugging is enabled. 85 * Update transaction's maximum wait time, if debugging is enabled.
86 * 86 *
87 * In order for t_max_wait to be reliable, it must be protected by a 87 * In order for t_max_wait to be reliable, it must be protected by a
88 * lock. But doing so will mean that start_this_handle() can not be 88 * lock. But doing so will mean that start_this_handle() can not be
@@ -91,11 +91,10 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
91 * means that maximum wait time reported by the jbd2_run_stats 91 * means that maximum wait time reported by the jbd2_run_stats
92 * tracepoint will always be zero. 92 * tracepoint will always be zero.
93 */ 93 */
94static inline void update_t_max_wait(transaction_t *transaction) 94static inline void update_t_max_wait(transaction_t *transaction,
95 unsigned long ts)
95{ 96{
96#ifdef CONFIG_JBD2_DEBUG 97#ifdef CONFIG_JBD2_DEBUG
97 unsigned long ts = jiffies;
98
99 if (jbd2_journal_enable_debug && 98 if (jbd2_journal_enable_debug &&
100 time_after(transaction->t_start, ts)) { 99 time_after(transaction->t_start, ts)) {
101 ts = jbd2_time_diff(ts, transaction->t_start); 100 ts = jbd2_time_diff(ts, transaction->t_start);
@@ -121,6 +120,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
121 tid_t tid; 120 tid_t tid;
122 int needed, need_to_start; 121 int needed, need_to_start;
123 int nblocks = handle->h_buffer_credits; 122 int nblocks = handle->h_buffer_credits;
123 unsigned long ts = jiffies;
124 124
125 if (nblocks > journal->j_max_transaction_buffers) { 125 if (nblocks > journal->j_max_transaction_buffers) {
126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", 126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -271,7 +271,7 @@ repeat:
271 /* OK, account for the buffers that this operation expects to 271 /* OK, account for the buffers that this operation expects to
272 * use and add the handle to the running transaction. 272 * use and add the handle to the running transaction.
273 */ 273 */
274 update_t_max_wait(transaction); 274 update_t_max_wait(transaction, ts);
275 handle->h_transaction = transaction; 275 handle->h_transaction = transaction;
276 atomic_inc(&transaction->t_updates); 276 atomic_inc(&transaction->t_updates);
277 atomic_inc(&transaction->t_handle_count); 277 atomic_inc(&transaction->t_handle_count);
@@ -316,7 +316,8 @@ static handle_t *new_handle(int nblocks)
316 * This function is visible to journal users (like ext3fs), so is not 316 * This function is visible to journal users (like ext3fs), so is not
317 * called with the journal already locked. 317 * called with the journal already locked.
318 * 318 *
319 * Return a pointer to a newly allocated handle, or NULL on failure 319 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
320 * on failure.
320 */ 321 */
321handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) 322handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
322{ 323{
@@ -921,8 +922,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
921 */ 922 */
922 JBUFFER_TRACE(jh, "cancelling revoke"); 923 JBUFFER_TRACE(jh, "cancelling revoke");
923 jbd2_journal_cancel_revoke(handle, jh); 924 jbd2_journal_cancel_revoke(handle, jh);
924 jbd2_journal_put_journal_head(jh);
925out: 925out:
926 jbd2_journal_put_journal_head(jh);
926 return err; 927 return err;
927} 928}
928 929
@@ -2147,6 +2148,13 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
2147 jinode->i_next_transaction == transaction) 2148 jinode->i_next_transaction == transaction)
2148 goto done; 2149 goto done;
2149 2150
2151 /*
2152 * We only ever set this variable to 1 so the test is safe. Since
2153 * t_need_data_flush is likely to be set, we do the test to save some
2154 * cacheline bouncing
2155 */
2156 if (!transaction->t_need_data_flush)
2157 transaction->t_need_data_flush = 1;
2150 /* On some different transaction's list - should be 2158 /* On some different transaction's list - should be
2151 * the committing one */ 2159 * the committing one */
2152 if (jinode->i_transaction) { 2160 if (jinode->i_transaction) {
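/*
 * Illustrative sketch (not kernel code): the test-before-set idiom from
 * the t_need_data_flush hunk above. For a shared flag that only ever
 * goes 0 -> 1, reading before writing keeps the cache line in shared
 * state on the common path; only the first setter takes it exclusive.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int need_flush;

static void mark_needs_flush(void)
{
        if (!atomic_load_explicit(&need_flush, memory_order_relaxed))
                atomic_store_explicit(&need_flush, 1, memory_order_relaxed);
}

int main(void)
{
        mark_needs_flush();     /* first caller dirties the cache line */
        mark_needs_flush();     /* later callers only read it */
        printf("need_flush = %d\n", atomic_load(&need_flush));
        return 0;
}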
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 82faddd1f321..05f73328b28b 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -609,6 +609,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
609 int ret; 609 int ret;
610 uint32_t now = get_seconds(); 610 uint32_t now = get_seconds();
611 611
612 dentry_unhash(dentry);
613
612 for (fd = f->dents ; fd; fd = fd->next) { 614 for (fd = f->dents ; fd; fd = fd->next) {
613 if (fd->ino) 615 if (fd->ino)
614 return -ENOTEMPTY; 616 return -ENOTEMPTY;
@@ -784,6 +786,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
784 uint8_t type; 786 uint8_t type;
785 uint32_t now; 787 uint32_t now;
786 788
789 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
790 dentry_unhash(new_dentry);
791
787 /* The VFS will check for us and prevent trying to rename a 792 /* The VFS will check for us and prevent trying to rename a
788 * file over a directory and vice versa, but if it's a directory, 793 * file over a directory and vice versa, but if it's a directory,
789 * the VFS can't check whether the victim is empty. The filesystem 794 * the VFS can't check whether the victim is empty. The filesystem
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index eaaf2b511e89..865df16a6cf3 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -360,6 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
360 360
361 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); 361 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
362 362
363 dentry_unhash(dentry);
364
363 /* Init inode for quota operations. */ 365 /* Init inode for quota operations. */
364 dquot_initialize(dip); 366 dquot_initialize(dip);
365 dquot_initialize(ip); 367 dquot_initialize(ip);
@@ -1095,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1095 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, 1097 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
1096 new_dentry->d_name.name); 1098 new_dentry->d_name.name);
1097 1099
1100 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
1101 dentry_unhash(new_dentry);
1102
1098 dquot_initialize(old_dir); 1103 dquot_initialize(old_dir);
1099 dquot_initialize(new_dir); 1104 dquot_initialize(new_dir);
1100 1105
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 1adc8d455f0e..df0de27c2733 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -10,6 +10,7 @@
10#include <linux/blkdev.h> 10#include <linux/blkdev.h>
11#include <linux/buffer_head.h> 11#include <linux/buffer_head.h>
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/prefetch.h>
13 14
14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) 15#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
15 16
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 9ed89d1663f8..f34c9cde9e94 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -273,6 +273,8 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
273{ 273{
274 struct inode *inode = dentry->d_inode; 274 struct inode *inode = dentry->d_inode;
275 275
276 dentry_unhash(dentry);
277
276 if (!logfs_empty_dir(inode)) 278 if (!logfs_empty_dir(inode))
277 return -ENOTEMPTY; 279 return -ENOTEMPTY;
278 280
@@ -622,6 +624,9 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
622 loff_t pos; 624 loff_t pos;
623 int err; 625 int err;
624 626
627 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
628 dentry_unhash(new_dentry);
629
625 /* 1. locate source dd */ 630 /* 1. locate source dd */
626 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); 631 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
627 if (err) 632 if (err)
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 9e22085231b3..d8d09380c7de 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -481,7 +481,7 @@ static int inode_write_alias(struct super_block *sb,
481 val = inode_val0(inode); 481 val = inode_val0(inode);
482 break; 482 break;
483 case INODE_USED_OFS: 483 case INODE_USED_OFS:
484 val = cpu_to_be64(li->li_used_bytes);; 484 val = cpu_to_be64(li->li_used_bytes);
485 break; 485 break;
486 case INODE_SIZE_OFS: 486 case INODE_SIZE_OFS:
487 val = cpu_to_be64(i_size_read(inode)); 487 val = cpu_to_be64(i_size_read(inode));
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 2f174be06555..8c32ef3ba88e 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -90,7 +90,8 @@ static DEFINE_SPINLOCK(mb_cache_spinlock);
90 * What the mbcache registers as to get shrunk dynamically. 90 * What the mbcache registers as to get shrunk dynamically.
91 */ 91 */
92 92
93static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); 93static int mb_cache_shrink_fn(struct shrinker *shrink,
94 struct shrink_control *sc);
94 95
95static struct shrinker mb_cache_shrinker = { 96static struct shrinker mb_cache_shrinker = {
96 .shrink = mb_cache_shrink_fn, 97 .shrink = mb_cache_shrink_fn,
@@ -156,18 +157,19 @@ forget:
  * gets low.
  *
  * @shrink: (ignored)
- * @nr_to_scan: Number of objects to scan
- * @gfp_mask: (ignored)
+ * @sc: shrink_control passed from reclaim
  *
  * Returns the number of objects which are present in the cache.
  */
 static int
-mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc)
 {
         LIST_HEAD(free_list);
         struct mb_cache *cache;
         struct mb_cache_entry *entry, *tmp;
         int count = 0;
+        int nr_to_scan = sc->nr_to_scan;
+        gfp_t gfp_mask = sc->gfp_mask;
 
         mb_debug("trying to free %d entries", nr_to_scan);
         spin_lock(&mb_cache_spinlock);
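The mbcache conversion above shows the new shrinker calling convention this series introduces: the (nr_to_scan, gfp_mask) argument pair is replaced by a single struct shrink_control, which callbacks unpack locally. A stand-alone model of that shape (the struct below is a stand-in mirroring only the two fields this hunk reads):

#include <stdio.h>

typedef unsigned gfp_t;

struct shrink_control {
        gfp_t gfp_mask;
        unsigned long nr_to_scan;
};

struct shrinker;        /* opaque, as in the callback's first argument */

static int cache_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
        unsigned long nr_to_scan = sc->nr_to_scan;   /* unpack, as the patch does */
        gfp_t gfp_mask = sc->gfp_mask;

        printf("scan %lu entries (gfp=%#x)\n", nr_to_scan, gfp_mask);
        return 0;       /* would return the number of cached objects */
}

int main(void)
{
        struct shrink_control sc = { .gfp_mask = 0xd0, .nr_to_scan = 128 };

        return cache_shrink(NULL, &sc);
}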
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 6e6777f1b4b2..f60aed8db9c4 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -168,6 +168,8 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
         struct inode * inode = dentry->d_inode;
         int err = -ENOTEMPTY;
 
+        dentry_unhash(dentry);
+
         if (minix_empty_dir(inode)) {
                 err = minix_unlink(dir, dentry);
                 if (!err) {
@@ -190,6 +192,9 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
         struct minix_dir_entry * old_de;
         int err = -ENOENT;
 
+        if (new_inode && S_ISDIR(new_inode->i_mode))
+                dentry_unhash(new_dentry);
+
         old_de = minix_find_entry(old_dentry, &old_page);
         if (!old_de)
                 goto out;
diff --git a/fs/mpage.c b/fs/mpage.c
index 0afc809e46e0..fdfae9fa98cd 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -27,6 +27,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
+#include <linux/cleancache.h>
 
 /*
  * I/O completion handler for multipage BIOs.
@@ -271,6 +272,12 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
                 SetPageMappedToDisk(page);
         }
 
+        if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
+            cleancache_get_page(page) == 0) {
+                SetPageUptodate(page);
+                goto confused;
+        }
+
         /*
          * This page will go to BIO.  Do we need to send this BIO off first?
          */
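The do_mpage_readpage() hook above gives cleancache a chance to satisfy a read before any bio is built: a return of 0 from cleancache_get_page() means the page was filled from the second-chance cache. A minimal stand-alone model of that control flow (the _model function stands in for the real hook, which always misses when no backend is registered):

#include <stdio.h>

static int cleancache_get_page_model(void *page)
{
        (void)page;
        return -1;      /* no cleancache backend: always miss */
}

static void read_page(void *page, int fully_mapped, int blocks_per_page,
                      int uptodate)
{
        if (fully_mapped && blocks_per_page == 1 && !uptodate &&
            cleancache_get_page_model(page) == 0) {
                puts("page filled from cleancache, no bio needed");
                return;
        }
        puts("submitting real block I/O");
}

int main(void)
{
        char page[4096];

        read_page(page, 1, 1, 0);
        return 0;
}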
diff --git a/fs/namei.c b/fs/namei.c
index e3c4f112ebf7..2358b326b221 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -391,79 +391,28 @@ void path_put(struct path *path)
 }
 EXPORT_SYMBOL(path_put);
 
-/**
- * nameidata_drop_rcu - drop this nameidata out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * Returns: 0 on success, -ECHILD on failure
- *
+/*
  * Path walking has 2 modes, rcu-walk and ref-walk (see
- * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
- * to drop out of rcu-walk mode and take normal reference counts on dentries
- * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take
- * refcounts at the last known good point before rcu-walk got stuck, so
- * ref-walk may continue from there. If this is not successful (eg. a seqcount
- * has changed), then failure is returned and path walk restarts from the
- * beginning in ref-walk mode.
- *
- * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
- * ref-walk. Must be called from rcu-walk context.
+ * Documentation/filesystems/path-lookup.txt). In situations when we can't
+ * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
+ * normal reference counts on dentries and vfsmounts to transition to rcu-walk
+ * mode. Refcounts are grabbed at the last known good point before rcu-walk
+ * got stuck, so ref-walk may continue from there. If this is not successful
+ * (eg. a seqcount has changed), then failure is returned and it's up to caller
+ * to restart the path walk from the beginning in ref-walk mode.
  */
-static int nameidata_drop_rcu(struct nameidata *nd)
-{
-        struct fs_struct *fs = current->fs;
-        struct dentry *dentry = nd->path.dentry;
-        int want_root = 0;
-
-        BUG_ON(!(nd->flags & LOOKUP_RCU));
-        if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
-                want_root = 1;
-                spin_lock(&fs->lock);
-                if (nd->root.mnt != fs->root.mnt ||
-                    nd->root.dentry != fs->root.dentry)
-                        goto err_root;
-        }
-        spin_lock(&dentry->d_lock);
-        if (!__d_rcu_to_refcount(dentry, nd->seq))
-                goto err;
-        BUG_ON(nd->inode != dentry->d_inode);
-        spin_unlock(&dentry->d_lock);
-        if (want_root) {
-                path_get(&nd->root);
-                spin_unlock(&fs->lock);
-        }
-        mntget(nd->path.mnt);
-
-        rcu_read_unlock();
-        br_read_unlock(vfsmount_lock);
-        nd->flags &= ~LOOKUP_RCU;
-        return 0;
-err:
-        spin_unlock(&dentry->d_lock);
-err_root:
-        if (want_root)
-                spin_unlock(&fs->lock);
-        return -ECHILD;
-}
-
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
-static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
-{
-        if (nd->flags & LOOKUP_RCU)
-                return nameidata_drop_rcu(nd);
-        return 0;
-}
 
 /**
- * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * @dentry: dentry to drop
+ * unlazy_walk - try to switch to ref-walk mode.
+ * @nd: nameidata pathwalk data
+ * @dentry: child of nd->path.dentry or NULL
  * Returns: 0 on success, -ECHILD on failure
  *
- * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
- * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
- * @nd. Must be called from rcu-walk context.
+ * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
+ * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
+ * @nd or NULL.  Must be called from rcu-walk context.
  */
-static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry)
+static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 {
         struct fs_struct *fs = current->fs;
         struct dentry *parent = nd->path.dentry;
@@ -478,18 +427,25 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
                 goto err_root;
         }
         spin_lock(&parent->d_lock);
-        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-        if (!__d_rcu_to_refcount(dentry, nd->seq))
-                goto err;
-        /*
-         * If the sequence check on the child dentry passed, then the child has
-         * not been removed from its parent. This means the parent dentry must
-         * be valid and able to take a reference at this point.
-         */
-        BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
-        BUG_ON(!parent->d_count);
-        parent->d_count++;
-        spin_unlock(&dentry->d_lock);
+        if (!dentry) {
+                if (!__d_rcu_to_refcount(parent, nd->seq))
+                        goto err_parent;
+                BUG_ON(nd->inode != parent->d_inode);
+        } else {
+                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+                if (!__d_rcu_to_refcount(dentry, nd->seq))
+                        goto err_child;
+                /*
+                 * If the sequence check on the child dentry passed, then
+                 * the child has not been removed from its parent. This
+                 * means the parent dentry must be valid and able to take
+                 * a reference at this point.
+                 */
+                BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
+                BUG_ON(!parent->d_count);
+                parent->d_count++;
+                spin_unlock(&dentry->d_lock);
+        }
         spin_unlock(&parent->d_lock);
         if (want_root) {
                 path_get(&nd->root);
@@ -501,8 +457,10 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
         br_read_unlock(vfsmount_lock);
         nd->flags &= ~LOOKUP_RCU;
         return 0;
-err:
+
+err_child:
         spin_unlock(&dentry->d_lock);
+err_parent:
         spin_unlock(&parent->d_lock);
 err_root:
         if (want_root)
@@ -510,59 +468,6 @@ err_root:
         return -ECHILD;
 }
 
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
-static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
-{
-        if (nd->flags & LOOKUP_RCU) {
-                if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
-                        nd->flags &= ~LOOKUP_RCU;
-                        if (!(nd->flags & LOOKUP_ROOT))
-                                nd->root.mnt = NULL;
-                        rcu_read_unlock();
-                        br_read_unlock(vfsmount_lock);
-                        return -ECHILD;
-                }
-        }
-        return 0;
-}
-
-/**
- * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * Returns: 0 on success, -ECHILD on failure
- *
- * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
- * nd->path should be the final element of the lookup, so nd->root is discarded.
- * Must be called from rcu-walk context.
- */
-static int nameidata_drop_rcu_last(struct nameidata *nd)
-{
-        struct dentry *dentry = nd->path.dentry;
-
-        BUG_ON(!(nd->flags & LOOKUP_RCU));
-        nd->flags &= ~LOOKUP_RCU;
-        if (!(nd->flags & LOOKUP_ROOT))
-                nd->root.mnt = NULL;
-        spin_lock(&dentry->d_lock);
-        if (!__d_rcu_to_refcount(dentry, nd->seq))
-                goto err_unlock;
-        BUG_ON(nd->inode != dentry->d_inode);
-        spin_unlock(&dentry->d_lock);
-
-        mntget(nd->path.mnt);
-
-        rcu_read_unlock();
-        br_read_unlock(vfsmount_lock);
-
-        return 0;
-
-err_unlock:
-        spin_unlock(&dentry->d_lock);
-        rcu_read_unlock();
-        br_read_unlock(vfsmount_lock);
-        return -ECHILD;
-}
-
 /**
  * release_open_intent - free up open intent resources
  * @nd: pointer to nameidata
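Taken together, the namei.c hunks above collapse three rcu-walk exit helpers (nameidata_drop_rcu, nameidata_dentry_drop_rcu, nameidata_drop_rcu_last) into unlazy_walk(), whose optional second argument selects between legitimizing nd->path alone or a child dentry as well. A stand-alone model of that calling convention (stand-in types, not the kernel's):

#include <stdio.h>

struct nameidata { int rcu; };
struct dentry { const char *name; };

static int unlazy_walk_model(struct nameidata *nd, struct dentry *child)
{
        if (child)
                printf("legitimize nd->path and child '%s'\n", child->name);
        else
                printf("legitimize nd->path only\n");
        nd->rcu = 0;    /* leave rcu-walk on success */
        return 0;       /* -ECHILD on a failed seqcount check */
}

int main(void)
{
        struct nameidata nd = { .rcu = 1 };
        struct dentry d = { .name = "etc" };

        unlazy_walk_model(&nd, NULL);   /* old nameidata_drop_rcu() case */
        unlazy_walk_model(&nd, &d);     /* old nameidata_dentry_drop_rcu() case */
        return 0;
}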
@@ -606,26 +511,39 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
         return dentry;
 }
 
-/*
- * handle_reval_path - force revalidation of a dentry
- *
- * In some situations the path walking code will trust dentries without
- * revalidating them. This causes problems for filesystems that depend on
- * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set
- * (which indicates that it's possible for the dentry to go stale), force
- * a d_revalidate call before proceeding.
+/**
+ * complete_walk - successful completion of path walk
+ * @nd:  pointer nameidata
  *
- * Returns 0 if the revalidation was successful. If the revalidation fails,
- * either return the error returned by d_revalidate or -ESTALE if the
- * revalidation it just returned 0. If d_revalidate returns 0, we attempt to
- * invalidate the dentry. It's up to the caller to handle putting references
- * to the path if necessary.
+ * If we had been in RCU mode, drop out of it and legitimize nd->path.
+ * Revalidate the final result, unless we'd already done that during
+ * the path walk or the filesystem doesn't ask for it.  Return 0 on
+ * success, -error on failure.  In case of failure caller does not
+ * need to drop nd->path.
  */
-static inline int handle_reval_path(struct nameidata *nd)
+static int complete_walk(struct nameidata *nd)
 {
         struct dentry *dentry = nd->path.dentry;
         int status;
 
+        if (nd->flags & LOOKUP_RCU) {
+                nd->flags &= ~LOOKUP_RCU;
+                if (!(nd->flags & LOOKUP_ROOT))
+                        nd->root.mnt = NULL;
+                spin_lock(&dentry->d_lock);
+                if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
+                        spin_unlock(&dentry->d_lock);
+                        rcu_read_unlock();
+                        br_read_unlock(vfsmount_lock);
+                        return -ECHILD;
+                }
+                BUG_ON(nd->inode != dentry->d_inode);
+                spin_unlock(&dentry->d_lock);
+                mntget(nd->path.mnt);
+                rcu_read_unlock();
+                br_read_unlock(vfsmount_lock);
+        }
+
         if (likely(!(nd->flags & LOOKUP_JUMPED)))
                 return 0;
 
@@ -643,6 +561,7 @@ static inline int handle_reval_path(struct nameidata *nd)
         if (!status)
                 status = -ESTALE;
 
+        path_put(&nd->path);
         return status;
 }
 
@@ -1241,13 +1160,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
                 if (likely(__follow_mount_rcu(nd, path, inode, false)))
                         return 0;
 unlazy:
-                if (dentry) {
-                        if (nameidata_dentry_drop_rcu(nd, dentry))
-                                return -ECHILD;
-                } else {
-                        if (nameidata_drop_rcu(nd))
-                                return -ECHILD;
-                }
+                if (unlazy_walk(nd, dentry))
+                        return -ECHILD;
         } else {
                 dentry = __d_lookup(parent, name);
         }
@@ -1303,7 +1217,7 @@ static inline int may_lookup(struct nameidata *nd)
                 int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
                 if (err != -ECHILD)
                         return err;
-                if (nameidata_drop_rcu(nd))
+                if (unlazy_walk(nd, NULL))
                         return -ECHILD;
         }
         return exec_permission(nd->inode, 0);
@@ -1357,8 +1271,12 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
                 return -ENOENT;
         }
         if (unlikely(inode->i_op->follow_link) && follow) {
-                if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
-                        return -ECHILD;
+                if (nd->flags & LOOKUP_RCU) {
+                        if (unlikely(unlazy_walk(nd, path->dentry))) {
+                                terminate_walk(nd);
+                                return -ECHILD;
+                        }
+                }
                 BUG_ON(inode != path->dentry->d_inode);
                 return 1;
         }
@@ -1378,12 +1296,12 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
 {
         int res;
 
-        BUG_ON(nd->depth >= MAX_NESTED_LINKS);
         if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
                 path_put_conditional(path, nd);
                 path_put(&nd->path);
                 return -ELOOP;
         }
+        BUG_ON(nd->depth >= MAX_NESTED_LINKS);
 
         nd->depth++;
         current->link_count++;
@@ -1657,18 +1575,8 @@ static int path_lookupat(int dfd, const char *name,
                 }
         }
 
-        if (nd->flags & LOOKUP_RCU) {
-                /* went all way through without dropping RCU */
-                BUG_ON(err);
-                if (nameidata_drop_rcu_last(nd))
-                        err = -ECHILD;
-        }
-
-        if (!err) {
-                err = handle_reval_path(nd);
-                if (err)
-                        path_put(&nd->path);
-        }
+        if (!err)
+                err = complete_walk(nd);
 
         if (!err && nd->flags & LOOKUP_DIRECTORY) {
                 if (!nd->inode->i_op->lookup) {
@@ -2134,13 +2042,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                 return ERR_PTR(error);
                 /* fallthrough */
         case LAST_ROOT:
-                if (nd->flags & LOOKUP_RCU) {
-                        if (nameidata_drop_rcu_last(nd))
-                                return ERR_PTR(-ECHILD);
-                }
-                error = handle_reval_path(nd);
+                error = complete_walk(nd);
                 if (error)
-                        goto exit;
+                        return ERR_PTR(error);
                 audit_inode(pathname, nd->path.dentry);
                 if (open_flag & O_CREAT) {
                         error = -EISDIR;
@@ -2148,10 +2052,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                 }
                 goto ok;
         case LAST_BIND:
-                /* can't be RCU mode here */
-                error = handle_reval_path(nd);
+                error = complete_walk(nd);
                 if (error)
-                        goto exit;
+                        return ERR_PTR(error);
                 audit_inode(pathname, dir);
                 goto ok;
         }
@@ -2170,10 +2073,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
         if (error)      /* symlink */
                 return NULL;
         /* sayonara */
-        if (nd->flags & LOOKUP_RCU) {
-                if (nameidata_drop_rcu_last(nd))
-                        return ERR_PTR(-ECHILD);
-        }
+        error = complete_walk(nd);
+        if (error)
+                return ERR_PTR(-ECHILD);
 
         error = -ENOTDIR;
         if (nd->flags & LOOKUP_DIRECTORY) {
@@ -2185,11 +2087,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
         }
 
         /* create side of things */
-
-        if (nd->flags & LOOKUP_RCU) {
-                if (nameidata_drop_rcu_last(nd))
-                        return ERR_PTR(-ECHILD);
-        }
+        error = complete_walk(nd);
+        if (error)
+                return ERR_PTR(error);
 
         audit_inode(pathname, dir);
         error = -EISDIR;
@@ -2629,10 +2529,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
 }
 
 /*
- * We try to drop the dentry early: we should have
- * a usage count of 2 if we're the only user of this
- * dentry, and if that is true (possibly after pruning
- * the dcache), then we drop the dentry now.
+ * The dentry_unhash() helper will try to drop the dentry early: we
+ * should have a usage count of 2 if we're the only user of this
+ * dentry, and if that is true (possibly after pruning the dcache),
+ * then we drop the dentry now.
  *
  * A low-level filesystem can, if it choses, legally
  * do a
@@ -2645,10 +2545,9 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
  */
 void dentry_unhash(struct dentry *dentry)
 {
-        dget(dentry);
         shrink_dcache_parent(dentry);
         spin_lock(&dentry->d_lock);
-        if (dentry->d_count == 2)
+        if (dentry->d_count == 1)
                 __d_drop(dentry);
         spin_unlock(&dentry->d_lock);
 }
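The d_count threshold in dentry_unhash() drops from 2 to 1 because the helper's dget() (and the callers' matching dput()) disappears: with no extra reference taken up front, a victim dentry used by nobody else now shows a count of 1. A stand-alone arithmetic model of the check (not kernel code):

#include <stdio.h>

struct dentry { int d_count; int hashed; };

static void dentry_unhash_model(struct dentry *d)
{
        if (d->d_count == 1)    /* sole user: safe to drop from the hash */
                d->hashed = 0;
}

int main(void)
{
        struct dentry d = { .d_count = 1, .hashed = 1 };

        dentry_unhash_model(&d);
        printf("hashed=%d\n", d.hashed);        /* 0: dropped */

        d = (struct dentry){ .d_count = 2, .hashed = 1 };
        dentry_unhash_model(&d);
        printf("hashed=%d\n", d.hashed);        /* 1: someone else holds a ref */
        return 0;
}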
@@ -2664,25 +2563,26 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
                 return -EPERM;
 
         mutex_lock(&dentry->d_inode->i_mutex);
-        dentry_unhash(dentry);
+
+        error = -EBUSY;
         if (d_mountpoint(dentry))
-                error = -EBUSY;
-        else {
-                error = security_inode_rmdir(dir, dentry);
-                if (!error) {
-                        error = dir->i_op->rmdir(dir, dentry);
-                        if (!error) {
-                                dentry->d_inode->i_flags |= S_DEAD;
-                                dont_mount(dentry);
-                        }
-                }
-        }
+                goto out;
+
+        error = security_inode_rmdir(dir, dentry);
+        if (error)
+                goto out;
+
+        error = dir->i_op->rmdir(dir, dentry);
+        if (error)
+                goto out;
+
+        dentry->d_inode->i_flags |= S_DEAD;
+        dont_mount(dentry);
+
+out:
         mutex_unlock(&dentry->d_inode->i_mutex);
-        if (!error) {
+        if (!error)
                 d_delete(dentry);
-        }
-        dput(dentry);
-
         return error;
 }
 
@@ -3053,12 +2953,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  * HOWEVER, it relies on the assumption that any object with ->lookup()
  * has no more than 1 dentry.  If "hybrid" objects will ever appear,
  * we'd better make sure that there's no link(2) for them.
- * d) some filesystems don't support opened-but-unlinked directories,
- *    either because of layout or because they are not ready to deal with
- *    all cases correctly. The latter will be fixed (taking this sort of
- *    stuff into VFS), but the former is not going away. Solution: the same
- *    trick as in rmdir().
- * e) conversion from fhandle to dentry may come in the wrong moment - when
+ * d) conversion from fhandle to dentry may come in the wrong moment - when
  *    we are removing the target. Solution: we will have to grab ->i_mutex
  *    in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
  *    ->i_mutex on parents, which works but leads to some truly excessive
@@ -3068,7 +2963,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
                           struct inode *new_dir, struct dentry *new_dentry)
 {
         int error = 0;
-        struct inode *target;
+        struct inode *target = new_dentry->d_inode;
 
         /*
          * If we are going to change the parent - check write permissions,
@@ -3084,26 +2979,24 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
         if (error)
                 return error;
 
-        target = new_dentry->d_inode;
         if (target)
                 mutex_lock(&target->i_mutex);
-        if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
-                error = -EBUSY;
-        else {
-                if (target)
-                        dentry_unhash(new_dentry);
-                error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-        }
+
+        error = -EBUSY;
+        if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
+                goto out;
+
+        error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+        if (error)
+                goto out;
+
         if (target) {
-                if (!error) {
-                        target->i_flags |= S_DEAD;
-                        dont_mount(new_dentry);
-                }
-                mutex_unlock(&target->i_mutex);
-                if (d_unhashed(new_dentry))
-                        d_rehash(new_dentry);
-                dput(new_dentry);
+                target->i_flags |= S_DEAD;
+                dont_mount(new_dentry);
         }
+out:
+        if (target)
+                mutex_unlock(&target->i_mutex);
         if (!error)
                 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
                         d_move(old_dentry,new_dentry);
@@ -3113,7 +3006,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry)
 {
-        struct inode *target;
+        struct inode *target = new_dentry->d_inode;
         int error;
 
         error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
@@ -3121,19 +3014,22 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
                 return error;
 
         dget(new_dentry);
-        target = new_dentry->d_inode;
         if (target)
                 mutex_lock(&target->i_mutex);
+
+        error = -EBUSY;
         if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
-                error = -EBUSY;
-        else
-                error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-        if (!error) {
-                if (target)
-                        dont_mount(new_dentry);
-                if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
-                        d_move(old_dentry, new_dentry);
-        }
+                goto out;
+
+        error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+        if (error)
+                goto out;
+
+        if (target)
+                dont_mount(new_dentry);
+        if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
+                d_move(old_dentry, new_dentry);
+out:
         if (target)
                 mutex_unlock(&target->i_mutex);
         dput(new_dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index d99bcf59e4c2..fe59bd145d21 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1695,7 +1695,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
 
 static int flags_to_propagation_type(int flags)
 {
-        int type = flags & ~MS_REC;
+        int type = flags & ~(MS_REC | MS_SILENT);
 
         /* Fail if any non-propagation flags are set */
         if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index f6946bb5cb55..e3e646b06404 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1033,6 +1033,8 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
         DPRINTK("ncp_rmdir: removing %s/%s\n",
                 dentry->d_parent->d_name.name, dentry->d_name.name);
 
+        dentry_unhash(dentry);
+
         error = -EBUSY;
         if (!d_unhashed(dentry))
                 goto out;
@@ -1139,6 +1141,9 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
                 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
                 new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
 
+        if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+                dentry_unhash(new_dentry);
+
         ncp_age_dentry(server, old_dentry);
         ncp_age_dentry(server, new_dentry);
 
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 0250e4ce4893..202f370526a7 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -461,7 +461,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 #endif
         struct ncp_entry_info finfo;
 
-        data.wdog_pid = NULL;
+        memset(&data, 0, sizeof(data));
         server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL);
         if (!server)
                 return -ENOMEM;
@@ -496,7 +496,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
                 struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data;
 
                 data.flags = md->flags;
-                data.int_flags = 0;
                 data.mounted_uid = md->mounted_uid;
                 data.wdog_pid = find_get_pid(md->wdog_pid);
                 data.ncp_fd = md->ncp_fd;
@@ -507,7 +506,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
                 data.file_mode = md->file_mode;
                 data.dir_mode = md->dir_mode;
                 data.info_fd = -1;
-                data.mounted_vol[0] = 0;
         }
                 break;
         default:
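Replacing the piecemeal field assignments in ncp_fill_super() with one memset() above guarantees that every member of the mount-data struct, padding included, starts zeroed, so the option-parsing cases only have to fill in what the caller supplied. A stand-alone model (the struct here is a stand-in, not ncp_mount_data_kernel's real layout):

#include <stdio.h>
#include <string.h>

struct mount_data_model {
        int flags;
        int int_flags;
        int info_fd;
        char mounted_vol[20];
};

int main(void)
{
        struct mount_data_model data;

        memset(&data, 0, sizeof(data));         /* replaces per-field zeroing */
        data.info_fd = -1;                      /* only non-zero defaults remain */

        printf("int_flags=%d vol='%s'\n", data.int_flags, data.mounted_vol);
        return 0;
}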
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index a7c07b44b100..e5d71b27a5b0 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -16,6 +16,7 @@
 #include <linux/mman.h>
 #include <linux/string.h>
 #include <linux/fcntl.h>
+#include <linux/memcontrol.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -92,6 +93,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area,
          * -- wli
          */
         count_vm_event(PGMAJFAULT);
+        mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT);
         return VM_FAULT_MAJOR;
 }
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7237672216c8..424e47773a84 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2042,11 +2042,14 @@ static void nfs_access_free_list(struct list_head *head)
         }
 }
 
-int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+int nfs_access_cache_shrinker(struct shrinker *shrink,
+                              struct shrink_control *sc)
 {
         LIST_HEAD(head);
         struct nfs_inode *nfsi, *next;
         struct nfs_access_entry *cache;
+        int nr_to_scan = sc->nr_to_scan;
+        gfp_t gfp_mask = sc->gfp_mask;
 
         if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
                 return (nr_to_scan == 0) ? 0 : -1;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ce118ce885dd..2df6ca7b5898 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -234,7 +234,7 @@ extern int nfs_init_client(struct nfs_client *clp,
 
 /* dir.c */
 extern int nfs_access_cache_shrinker(struct shrinker *shrink,
-                                     int nr_to_scan, gfp_t gfp_mask);
+                                     struct shrink_control *sc);
 
 /* inode.c */
 extern struct workqueue_struct *nfsiod_workqueue;
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 5232d3e8fb2f..a2e2402b2afb 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -8,7 +8,7 @@
  * Statistsics for the reply cache
  * fh <stale> <total-lookups> <anonlookups> <dir-not-in-dcache> <nondir-not-in-dcache>
  *        statistics for filehandle lookup
- * io <bytes-read> <bytes-writtten>
+ * io <bytes-read> <bytes-written>
  *        statistics for IO throughput
  * th <threads> <fullcnt> <10%-20%> <20%-30%> ... <90%-100%> <100%>
  *        time (seconds) when nfsd thread usage above thresholds
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index f7684483785e..eed4d7b26249 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -489,8 +489,8 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 void nilfs_palloc_commit_alloc_entry(struct inode *inode,
                                      struct nilfs_palloc_req *req)
 {
-        nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
-        nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
+        mark_buffer_dirty(req->pr_bitmap_bh);
+        mark_buffer_dirty(req->pr_desc_bh);
         nilfs_mdt_mark_dirty(inode);
 
         brelse(req->pr_bitmap_bh);
@@ -527,8 +527,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
         kunmap(req->pr_bitmap_bh->b_page);
         kunmap(req->pr_desc_bh->b_page);
 
-        nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
-        nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
+        mark_buffer_dirty(req->pr_desc_bh);
+        mark_buffer_dirty(req->pr_bitmap_bh);
         nilfs_mdt_mark_dirty(inode);
 
         brelse(req->pr_bitmap_bh);
@@ -683,8 +683,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
         kunmap(bitmap_bh->b_page);
         kunmap(desc_bh->b_page);
 
-        nilfs_mdt_mark_buffer_dirty(desc_bh);
-        nilfs_mdt_mark_buffer_dirty(bitmap_bh);
+        mark_buffer_dirty(desc_bh);
+        mark_buffer_dirty(bitmap_bh);
         nilfs_mdt_mark_dirty(inode);
 
         brelse(bitmap_bh);
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 4723f04e9b12..aadbd0b5e3e8 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -34,7 +34,9 @@
 
 struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
 {
-        return NILFS_I_NILFS(bmap->b_inode)->ns_dat;
+        struct the_nilfs *nilfs = bmap->b_inode->i_sb->s_fs_info;
+
+        return nilfs->ns_dat;
 }
 
 static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap,
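This bmap.c hunk is the first of many nilfs2 changes that retire the NILFS_I_NILFS() helper in favor of the generic VFS back pointer inode->i_sb->s_fs_info (and i_sb->s_bdev for the block device). A stand-alone model of the accessor shape (stand-in types, not the kernel's):

#include <stdio.h>

struct the_nilfs { const char *ns_dat; };
struct super_block { void *s_fs_info; };
struct inode { struct super_block *i_sb; };

static const char *get_dat(struct inode *inode)
{
        struct the_nilfs *nilfs = inode->i_sb->s_fs_info;

        return nilfs->ns_dat;   /* same object the old helper reached */
}

int main(void)
{
        struct the_nilfs nilfs = { .ns_dat = "DAT inode" };
        struct super_block sb = { .s_fs_info = &nilfs };
        struct inode inode = { .i_sb = &sb };

        printf("%s\n", get_dat(&inode));
        return 0;
}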
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 609cd223eea8..a35ae35e6932 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -34,12 +34,6 @@
 #include "page.h"
 #include "btnode.h"
 
-void nilfs_btnode_cache_init(struct address_space *btnc,
-                             struct backing_dev_info *bdi)
-{
-        nilfs_mapping_init(btnc, bdi);
-}
-
 void nilfs_btnode_cache_clear(struct address_space *btnc)
 {
         invalidate_mapping_pages(btnc, 0, -1);
@@ -62,7 +56,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
                 BUG();
         }
         memset(bh->b_data, 0, 1 << inode->i_blkbits);
-        bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+        bh->b_bdev = inode->i_sb->s_bdev;
         bh->b_blocknr = blocknr;
         set_buffer_mapped(bh);
         set_buffer_uptodate(bh);
@@ -94,10 +88,11 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
         if (pblocknr == 0) {
                 pblocknr = blocknr;
                 if (inode->i_ino != NILFS_DAT_INO) {
-                        struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
+                        struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
 
                         /* blocknr is a virtual block number */
-                        err = nilfs_dat_translate(dat, blocknr, &pblocknr);
+                        err = nilfs_dat_translate(nilfs->ns_dat, blocknr,
+                                                  &pblocknr);
                         if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
                                 brelse(bh);
                                 goto out_locked;
@@ -120,7 +115,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
                 goto found;
         }
         set_buffer_mapped(bh);
-        bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+        bh->b_bdev = inode->i_sb->s_bdev;
         bh->b_blocknr = pblocknr; /* set block address for read */
         bh->b_end_io = end_buffer_read_sync;
         get_bh(bh);
@@ -259,7 +254,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
259 "invalid oldkey %lld (newkey=%lld)", 254 "invalid oldkey %lld (newkey=%lld)",
260 (unsigned long long)oldkey, 255 (unsigned long long)oldkey,
261 (unsigned long long)newkey); 256 (unsigned long long)newkey);
262 nilfs_btnode_mark_dirty(obh); 257 mark_buffer_dirty(obh);
263 258
264 spin_lock_irq(&btnc->tree_lock); 259 spin_lock_irq(&btnc->tree_lock);
265 radix_tree_delete(&btnc->page_tree, oldkey); 260 radix_tree_delete(&btnc->page_tree, oldkey);
@@ -271,7 +266,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
                 unlock_page(opage);
         } else {
                 nilfs_copy_buffer(nbh, obh);
-                nilfs_btnode_mark_dirty(nbh);
+                mark_buffer_dirty(nbh);
 
                 nbh->b_blocknr = newkey;
                 ctxt->bh = nbh;
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 1b8ebd888c28..3a4dd2d8d3fc 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt {
         struct buffer_head *newbh;
 };
 
-void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
 struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
                                               __u64 blocknr);
@@ -51,7 +50,4 @@ void nilfs_btnode_commit_change_key(struct address_space *,
 void nilfs_btnode_abort_change_key(struct address_space *,
                                    struct nilfs_btnode_chkey_ctxt *);
 
-#define nilfs_btnode_mark_dirty(bh)        nilfs_mark_buffer_dirty(bh)
-
-
 #endif  /* _NILFS_BTNODE_H */
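With the nilfs_btnode_mark_dirty() alias deleted above (and nilfs_mdt_mark_buffer_dirty() retired likewise), the remaining nilfs2 hunks in this series all reduce to one call-site shape: test the dirty bit, then use the generic buffer-cache helper. A stand-alone model of that shape (stand-in types, not fs/buffer.c):

#include <stdio.h>

struct buffer_head { int dirty; };

static int buffer_dirty(struct buffer_head *bh) { return bh->dirty; }
static void mark_buffer_dirty(struct buffer_head *bh) { bh->dirty = 1; }

int main(void)
{
        struct buffer_head bh = { .dirty = 0 };

        if (!buffer_dirty(&bh))         /* the guard kept at every call site */
                mark_buffer_dirty(&bh);

        printf("dirty=%d\n", bh.dirty);
        return 0;
}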
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index d451ae0e0bf3..7eafe468a29c 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -714,7 +714,7 @@ static void nilfs_btree_promote_key(struct nilfs_bmap *btree,
                         nilfs_btree_get_nonroot_node(path, level),
                         path[level].bp_index, key);
                 if (!buffer_dirty(path[level].bp_bh))
-                        nilfs_btnode_mark_dirty(path[level].bp_bh);
+                        mark_buffer_dirty(path[level].bp_bh);
         } while ((path[level].bp_index == 0) &&
                  (++level < nilfs_btree_height(btree) - 1));
         }
@@ -739,7 +739,7 @@ static void nilfs_btree_do_insert(struct nilfs_bmap *btree,
         nilfs_btree_node_insert(node, path[level].bp_index,
                                 *keyp, *ptrp, ncblk);
         if (!buffer_dirty(path[level].bp_bh))
-                nilfs_btnode_mark_dirty(path[level].bp_bh);
+                mark_buffer_dirty(path[level].bp_bh);
 
         if (path[level].bp_index == 0)
                 nilfs_btree_promote_key(btree, path, level + 1,
@@ -777,9 +777,9 @@ static void nilfs_btree_carry_left(struct nilfs_bmap *btree,
         nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
 
         if (!buffer_dirty(path[level].bp_bh))
-                nilfs_btnode_mark_dirty(path[level].bp_bh);
+                mark_buffer_dirty(path[level].bp_bh);
         if (!buffer_dirty(path[level].bp_sib_bh))
-                nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+                mark_buffer_dirty(path[level].bp_sib_bh);
 
         nilfs_btree_promote_key(btree, path, level + 1,
                                 nilfs_btree_node_get_key(node, 0));
@@ -823,9 +823,9 @@ static void nilfs_btree_carry_right(struct nilfs_bmap *btree,
         nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
 
         if (!buffer_dirty(path[level].bp_bh))
-                nilfs_btnode_mark_dirty(path[level].bp_bh);
+                mark_buffer_dirty(path[level].bp_bh);
         if (!buffer_dirty(path[level].bp_sib_bh))
-                nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+                mark_buffer_dirty(path[level].bp_sib_bh);
 
         path[level + 1].bp_index++;
         nilfs_btree_promote_key(btree, path, level + 1,
@@ -870,9 +870,9 @@ static void nilfs_btree_split(struct nilfs_bmap *btree,
         nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
 
         if (!buffer_dirty(path[level].bp_bh))
-                nilfs_btnode_mark_dirty(path[level].bp_bh);
+                mark_buffer_dirty(path[level].bp_bh);
         if (!buffer_dirty(path[level].bp_sib_bh))
-                nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+                mark_buffer_dirty(path[level].bp_sib_bh);
 
         newkey = nilfs_btree_node_get_key(right, 0);
         newptr = path[level].bp_newreq.bpr_ptr;
@@ -919,7 +919,7 @@ static void nilfs_btree_grow(struct nilfs_bmap *btree,
         nilfs_btree_node_set_level(root, level + 1);
 
         if (!buffer_dirty(path[level].bp_sib_bh))
-                nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+                mark_buffer_dirty(path[level].bp_sib_bh);
 
         path[level].bp_bh = path[level].bp_sib_bh;
         path[level].bp_sib_bh = NULL;
@@ -1194,7 +1194,7 @@ static void nilfs_btree_do_delete(struct nilfs_bmap *btree,
         nilfs_btree_node_delete(node, path[level].bp_index,
                                 keyp, ptrp, ncblk);
         if (!buffer_dirty(path[level].bp_bh))
-                nilfs_btnode_mark_dirty(path[level].bp_bh);
+                mark_buffer_dirty(path[level].bp_bh);
         if (path[level].bp_index == 0)
                 nilfs_btree_promote_key(btree, path, level + 1,
                                         nilfs_btree_node_get_key(node, 0));
@@ -1226,9 +1226,9 @@ static void nilfs_btree_borrow_left(struct nilfs_bmap *btree,
         nilfs_btree_node_move_right(left, node, n, ncblk, ncblk);
 
         if (!buffer_dirty(path[level].bp_bh))
-                nilfs_btnode_mark_dirty(path[level].bp_bh);
+                mark_buffer_dirty(path[level].bp_bh);
         if (!buffer_dirty(path[level].bp_sib_bh))
-                nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+                mark_buffer_dirty(path[level].bp_sib_bh);
 
         nilfs_btree_promote_key(btree, path, level + 1,
                                 nilfs_btree_node_get_key(node, 0));
@@ -1258,9 +1258,9 @@ static void nilfs_btree_borrow_right(struct nilfs_bmap *btree,
         nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
 
         if (!buffer_dirty(path[level].bp_bh))
-                nilfs_btnode_mark_dirty(path[level].bp_bh);
+                mark_buffer_dirty(path[level].bp_bh);
         if (!buffer_dirty(path[level].bp_sib_bh))
-                nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+                mark_buffer_dirty(path[level].bp_sib_bh);
 
         path[level + 1].bp_index++;
         nilfs_btree_promote_key(btree, path, level + 1,
@@ -1289,7 +1289,7 @@ static void nilfs_btree_concat_left(struct nilfs_bmap *btree,
         nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
 
         if (!buffer_dirty(path[level].bp_sib_bh))
-                nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+                mark_buffer_dirty(path[level].bp_sib_bh);
 
         nilfs_btnode_delete(path[level].bp_bh);
         path[level].bp_bh = path[level].bp_sib_bh;
@@ -1315,7 +1315,7 @@ static void nilfs_btree_concat_right(struct nilfs_bmap *btree,
         nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
 
         if (!buffer_dirty(path[level].bp_bh))
-                nilfs_btnode_mark_dirty(path[level].bp_bh);
+                mark_buffer_dirty(path[level].bp_bh);
 
         nilfs_btnode_delete(path[level].bp_sib_bh);
         path[level].bp_sib_bh = NULL;
@@ -1709,7 +1709,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
         nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs);
         nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk);
         if (!buffer_dirty(bh))
-                nilfs_btnode_mark_dirty(bh);
+                mark_buffer_dirty(bh);
         if (!nilfs_bmap_dirty(btree))
                 nilfs_bmap_set_dirty(btree);
 
@@ -1787,7 +1787,7 @@ static int nilfs_btree_propagate_p(struct nilfs_bmap *btree,
 {
         while ((++level < nilfs_btree_height(btree) - 1) &&
                !buffer_dirty(path[level].bp_bh))
-                nilfs_btnode_mark_dirty(path[level].bp_bh);
+                mark_buffer_dirty(path[level].bp_bh);
 
         return 0;
 }
@@ -2229,7 +2229,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level)
         }
 
         if (!buffer_dirty(bh))
-                nilfs_btnode_mark_dirty(bh);
+                mark_buffer_dirty(bh);
         brelse(bh);
         if (!nilfs_bmap_dirty(btree))
                 nilfs_bmap_set_dirty(btree);
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 5ff15a8a1024..c9b342c8b503 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -216,14 +216,14 @@ int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
                 if (!nilfs_cpfile_is_in_first(cpfile, cno))
                         nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh,
                                                                  kaddr, 1);
-                nilfs_mdt_mark_buffer_dirty(cp_bh);
+                mark_buffer_dirty(cp_bh);
 
                 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
                 header = nilfs_cpfile_block_get_header(cpfile, header_bh,
                                                        kaddr);
                 le64_add_cpu(&header->ch_ncheckpoints, 1);
                 kunmap_atomic(kaddr, KM_USER0);
-                nilfs_mdt_mark_buffer_dirty(header_bh);
+                mark_buffer_dirty(header_bh);
                 nilfs_mdt_mark_dirty(cpfile);
         }
 
@@ -326,7 +326,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
                 }
                 if (nicps > 0) {
                         tnicps += nicps;
-                        nilfs_mdt_mark_buffer_dirty(cp_bh);
+                        mark_buffer_dirty(cp_bh);
                         nilfs_mdt_mark_dirty(cpfile);
                         if (!nilfs_cpfile_is_in_first(cpfile, cno)) {
                                 count =
@@ -358,7 +358,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
                 header = nilfs_cpfile_block_get_header(cpfile, header_bh,
                                                        kaddr);
                 le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps);
-                nilfs_mdt_mark_buffer_dirty(header_bh);
+                mark_buffer_dirty(header_bh);
                 nilfs_mdt_mark_dirty(cpfile);
                 kunmap_atomic(kaddr, KM_USER0);
         }
@@ -671,10 +671,10 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
         le64_add_cpu(&header->ch_nsnapshots, 1);
         kunmap_atomic(kaddr, KM_USER0);
 
-        nilfs_mdt_mark_buffer_dirty(prev_bh);
-        nilfs_mdt_mark_buffer_dirty(curr_bh);
-        nilfs_mdt_mark_buffer_dirty(cp_bh);
-        nilfs_mdt_mark_buffer_dirty(header_bh);
+        mark_buffer_dirty(prev_bh);
+        mark_buffer_dirty(curr_bh);
+        mark_buffer_dirty(cp_bh);
+        mark_buffer_dirty(header_bh);
         nilfs_mdt_mark_dirty(cpfile);
 
         brelse(prev_bh);
@@ -774,10 +774,10 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
         le64_add_cpu(&header->ch_nsnapshots, -1);
         kunmap_atomic(kaddr, KM_USER0);
 
-        nilfs_mdt_mark_buffer_dirty(next_bh);
-        nilfs_mdt_mark_buffer_dirty(prev_bh);
-        nilfs_mdt_mark_buffer_dirty(cp_bh);
-        nilfs_mdt_mark_buffer_dirty(header_bh);
+        mark_buffer_dirty(next_bh);
+        mark_buffer_dirty(prev_bh);
+        mark_buffer_dirty(cp_bh);
+        mark_buffer_dirty(header_bh);
         nilfs_mdt_mark_dirty(cpfile);
 
         brelse(prev_bh);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 59e5fe742f7b..fcc2f869af16 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -54,7 +54,7 @@ static int nilfs_dat_prepare_entry(struct inode *dat,
 static void nilfs_dat_commit_entry(struct inode *dat,
                                    struct nilfs_palloc_req *req)
 {
-        nilfs_mdt_mark_buffer_dirty(req->pr_entry_bh);
+        mark_buffer_dirty(req->pr_entry_bh);
         nilfs_mdt_mark_dirty(dat);
         brelse(req->pr_entry_bh);
 }
@@ -361,7 +361,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
         entry->de_blocknr = cpu_to_le64(blocknr);
         kunmap_atomic(kaddr, KM_USER0);
 
-        nilfs_mdt_mark_buffer_dirty(entry_bh);
+        mark_buffer_dirty(entry_bh);
         nilfs_mdt_mark_dirty(dat);
 
         brelse(entry_bh);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 397e73258631..d7eeca62febd 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -111,7 +111,6 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
         nilfs_transaction_commit(inode->i_sb);
 
  mapped:
-        SetPageChecked(page);
         wait_on_page_writeback(page);
         return VM_FAULT_LOCKED;
 }
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 1c2a3e23f8b2..08a07a218d26 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -48,9 +48,6 @@
 #include "dat.h"
 #include "ifile.h"
 
-static const struct address_space_operations def_gcinode_aops = {
-};
-
 /*
  * nilfs_gccache_submit_read_data() - add data buffer and submit read request
  * @inode - gc inode
@@ -87,9 +84,9 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
                 goto out;
 
         if (pbn == 0) {
-                struct inode *dat_inode = NILFS_I_NILFS(inode)->ns_dat;
-                                        /* use original dat, not gc dat. */
-                err = nilfs_dat_translate(dat_inode, vbn, &pbn);
+                struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+
+                err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn);
                 if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
                         brelse(bh);
                         goto failed;
@@ -103,7 +100,7 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
         }
 
         if (!buffer_mapped(bh)) {
-                bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+                bh->b_bdev = inode->i_sb->s_bdev;
                 set_buffer_mapped(bh);
         }
         bh->b_blocknr = pbn;
@@ -160,15 +157,11 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
         if (buffer_dirty(bh))
                 return -EEXIST;
 
-        if (buffer_nilfs_node(bh)) {
-                if (nilfs_btree_broken_node_block(bh)) {
-                        clear_buffer_uptodate(bh);
-                        return -EIO;
-                }
-                nilfs_btnode_mark_dirty(bh);
-        } else {
-                nilfs_mark_buffer_dirty(bh);
+        if (buffer_nilfs_node(bh) && nilfs_btree_broken_node_block(bh)) {
+                clear_buffer_uptodate(bh);
+                return -EIO;
         }
+        mark_buffer_dirty(bh);
         return 0;
 }
 
@@ -178,7 +171,7 @@ int nilfs_init_gcinode(struct inode *inode)
 
         inode->i_mode = S_IFREG;
         mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
-        inode->i_mapping->a_ops = &def_gcinode_aops;
+        inode->i_mapping->a_ops = &empty_aops;
         inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
 
         ii->i_flags = 0;
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index bfc73d3a30ed..684d76300a80 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -80,7 +80,7 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
                 return ret;
         }
         nilfs_palloc_commit_alloc_entry(ifile, &req);
-        nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
+        mark_buffer_dirty(req.pr_entry_bh);
         nilfs_mdt_mark_dirty(ifile);
         *out_ino = (ino_t)req.pr_entry_nr;
         *out_bh = req.pr_entry_bh;
@@ -128,7 +128,7 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino)
         raw_inode->i_flags = 0;
         kunmap_atomic(kaddr, KM_USER0);
 
-        nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
+        mark_buffer_dirty(req.pr_entry_bh);
         brelse(req.pr_entry_bh);
 
         nilfs_palloc_commit_free_entry(ifile, &req);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index c0aa27490c02..587f18432832 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -74,14 +74,14 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
                     struct buffer_head *bh_result, int create)
 {
         struct nilfs_inode_info *ii = NILFS_I(inode);
+        struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
         __u64 blknum = 0;
         int err = 0, ret;
-        struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
         unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
 
-        down_read(&NILFS_MDT(dat)->mi_sem);
+        down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
         ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
-        up_read(&NILFS_MDT(dat)->mi_sem);
+        up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
         if (ret >= 0) {  /* found */
                 map_bh(bh_result, inode->i_sb, blknum);
                 if (ret > 0)
@@ -596,6 +596,16 @@ void nilfs_write_inode_common(struct inode *inode,
596 raw_inode->i_flags = cpu_to_le32(ii->i_flags); 596 raw_inode->i_flags = cpu_to_le32(ii->i_flags);
597 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 597 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
598 598
599 if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) {
600 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
601
602 /* zero-fill unused portion in the case of super root block */
603 raw_inode->i_xattr = 0;
604 raw_inode->i_pad = 0;
605 memset((void *)raw_inode + sizeof(*raw_inode), 0,
606 nilfs->ns_inode_size - sizeof(*raw_inode));
607 }
608
599 if (has_bmap) 609 if (has_bmap)
600 nilfs_bmap_write(ii->i_bmap, raw_inode); 610 nilfs_bmap_write(ii->i_bmap, raw_inode);
601 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 611 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
@@ -872,8 +882,7 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
872 return -EINVAL; /* NILFS_I_DIRTY may remain for 882 return -EINVAL; /* NILFS_I_DIRTY may remain for
873 freeing inode */ 883 freeing inode */
874 } 884 }
875 list_del(&ii->i_dirty); 885 list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
876 list_add_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
877 set_bit(NILFS_I_QUEUED, &ii->i_state); 886 set_bit(NILFS_I_QUEUED, &ii->i_state);
878 } 887 }
879 spin_unlock(&nilfs->ns_inode_lock); 888 spin_unlock(&nilfs->ns_inode_lock);
@@ -892,7 +901,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
892 return err; 901 return err;
893 } 902 }
894 nilfs_update_inode(inode, ibh); 903 nilfs_update_inode(inode, ibh);
895 nilfs_mdt_mark_buffer_dirty(ibh); 904 mark_buffer_dirty(ibh);
896 nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile); 905 nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
897 brelse(ibh); 906 brelse(ibh);
898 return 0; 907 return 0;
@@ -931,7 +940,7 @@ void nilfs_dirty_inode(struct inode *inode)
931int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 940int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
932 __u64 start, __u64 len) 941 __u64 start, __u64 len)
933{ 942{
934 struct the_nilfs *nilfs = NILFS_I_NILFS(inode); 943 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
935 __u64 logical = 0, phys = 0, size = 0; 944 __u64 logical = 0, phys = 0, size = 0;
936 __u32 flags = 0; 945 __u32 flags = 0;
937 loff_t isize; 946 loff_t isize;
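
The zero-fill added to nilfs_write_inode_common() matters because the root metadata files (DAT, cpfile, sufile) are serialized into the super root block in slots of ns_inode_size bytes, which can exceed sizeof(struct nilfs_inode). A hedged worked example: if a volume were formatted with ns_inode_size = 256 while struct nilfs_inode occupies 128 bytes, the memset clears the trailing 128 bytes of each slot, so stale block contents can no longer leak into the super root or perturb its checksum.
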
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f2469ba6246b..41d6743d303c 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -698,6 +698,63 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
698 return 0; 698 return 0;
699} 699}
700 700
701static int nilfs_ioctl_resize(struct inode *inode, struct file *filp,
702 void __user *argp)
703{
704 __u64 newsize;
705 int ret = -EPERM;
706
707 if (!capable(CAP_SYS_ADMIN))
708 goto out;
709
710 ret = mnt_want_write(filp->f_path.mnt);
711 if (ret)
712 goto out;
713
714 ret = -EFAULT;
715 if (copy_from_user(&newsize, argp, sizeof(newsize)))
716 goto out_drop_write;
717
718 ret = nilfs_resize_fs(inode->i_sb, newsize);
719
720out_drop_write:
721 mnt_drop_write(filp->f_path.mnt);
722out:
723 return ret;
724}
725
726static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
727{
728 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
729 __u64 range[2];
730 __u64 minseg, maxseg;
731 unsigned long segbytes;
732 int ret = -EPERM;
733
734 if (!capable(CAP_SYS_ADMIN))
735 goto out;
736
737 ret = -EFAULT;
738 if (copy_from_user(range, argp, sizeof(__u64[2])))
739 goto out;
740
741 ret = -ERANGE;
742 if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode))
743 goto out;
744
745 segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize;
746
747 minseg = range[0] + segbytes - 1;
748 do_div(minseg, segbytes);
749 maxseg = NILFS_SB2_OFFSET_BYTES(range[1]);
750 do_div(maxseg, segbytes);
751 maxseg--;
752
753 ret = nilfs_sufile_set_alloc_range(nilfs->ns_sufile, minseg, maxseg);
754out:
755 return ret;
756}
757
701static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, 758static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
702 unsigned int cmd, void __user *argp, 759 unsigned int cmd, void __user *argp,
703 size_t membsz, 760 size_t membsz,
@@ -763,6 +820,10 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
763 return nilfs_ioctl_clean_segments(inode, filp, cmd, argp); 820 return nilfs_ioctl_clean_segments(inode, filp, cmd, argp);
764 case NILFS_IOCTL_SYNC: 821 case NILFS_IOCTL_SYNC:
765 return nilfs_ioctl_sync(inode, filp, cmd, argp); 822 return nilfs_ioctl_sync(inode, filp, cmd, argp);
823 case NILFS_IOCTL_RESIZE:
824 return nilfs_ioctl_resize(inode, filp, argp);
825 case NILFS_IOCTL_SET_ALLOC_RANGE:
826 return nilfs_ioctl_set_alloc_range(inode, argp);
766 default: 827 default:
767 return -ENOTTY; 828 return -ENOTTY;
768 } 829 }
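
Both new commands take plain __u64 payloads, so driving them from user space is direct. A hedged sketch; the command macro is assumed to come from <linux/nilfs2_fs.h>, where this series defines it, and the mount point is illustrative:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/nilfs2_fs.h>	/* NILFS_IOCTL_RESIZE (assumed) */

	int main(void)
	{
		uint64_t newsize = 8ULL << 30;		/* shrink to 8 GiB */
		int fd = open("/mnt/nilfs", O_RDONLY);	/* any file on the mount */
		int ret = 1;

		if (fd < 0 || ioctl(fd, NILFS_IOCTL_RESIZE, &newsize) < 0)
			perror("NILFS_IOCTL_RESIZE");
		else
			ret = 0;
		if (fd >= 0)
			close(fd);
		return ret;
	}

Note the rounding in nilfs_ioctl_set_alloc_range(): the lower byte bound is rounded up to a whole segment (range[0] + segbytes - 1, then do_div), while the upper bound is derived from NILFS_SB2_OFFSET_BYTES(range[1]) and rounded down, keeping the area reserved for the secondary super block out of the allocatable range.
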
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index a649b05f7069..800e8d78a83b 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -66,7 +66,7 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
66 kunmap_atomic(kaddr, KM_USER0); 66 kunmap_atomic(kaddr, KM_USER0);
67 67
68 set_buffer_uptodate(bh); 68 set_buffer_uptodate(bh);
69 nilfs_mark_buffer_dirty(bh); 69 mark_buffer_dirty(bh);
70 nilfs_mdt_mark_dirty(inode); 70 nilfs_mdt_mark_dirty(inode);
71 return 0; 71 return 0;
72} 72}
@@ -355,7 +355,7 @@ int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
355 err = nilfs_mdt_read_block(inode, block, 0, &bh); 355 err = nilfs_mdt_read_block(inode, block, 0, &bh);
356 if (unlikely(err)) 356 if (unlikely(err))
357 return err; 357 return err;
358 nilfs_mark_buffer_dirty(bh); 358 mark_buffer_dirty(bh);
359 nilfs_mdt_mark_dirty(inode); 359 nilfs_mdt_mark_dirty(inode);
360 brelse(bh); 360 brelse(bh);
361 return 0; 361 return 0;
@@ -450,9 +450,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
450 450
451 INIT_LIST_HEAD(&shadow->frozen_buffers); 451 INIT_LIST_HEAD(&shadow->frozen_buffers);
452 address_space_init_once(&shadow->frozen_data); 452 address_space_init_once(&shadow->frozen_data);
453 nilfs_mapping_init(&shadow->frozen_data, bdi); 453 nilfs_mapping_init(&shadow->frozen_data, inode, bdi);
454 address_space_init_once(&shadow->frozen_btnodes); 454 address_space_init_once(&shadow->frozen_btnodes);
455 nilfs_mapping_init(&shadow->frozen_btnodes, bdi); 455 nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi);
456 mi->mi_shadow = shadow; 456 mi->mi_shadow = shadow;
457 return 0; 457 return 0;
458} 458}
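
The extra inode argument threaded through nilfs_mapping_init() here is the same fix as in page.c below: the shadow caches used for rollback (frozen_data, frozen_btnodes) previously had mapping->host == NULL, so their buffers could only be dirtied through the private nilfs helper. Passing the owning metadata inode gives them a valid host and lets the generic buffer-dirtying path cover shadow buffers as well.
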
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index ed68563ec708..ab20a4baa50f 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -64,11 +64,6 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
64 return inode->i_private; 64 return inode->i_private;
65} 65}
66 66
67static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
68{
69 return inode->i_sb->s_fs_info;
70}
71
72/* Default GFP flags using highmem */ 67/* Default GFP flags using highmem */
73#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM) 68#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
74 69
@@ -93,8 +88,6 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh);
93struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode, 88struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode,
94 struct buffer_head *bh); 89 struct buffer_head *bh);
95 90
96#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh)
97
98static inline void nilfs_mdt_mark_dirty(struct inode *inode) 91static inline void nilfs_mdt_mark_dirty(struct inode *inode)
99{ 92{
100 if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state)) 93 if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state))
@@ -108,7 +101,7 @@ static inline void nilfs_mdt_clear_dirty(struct inode *inode)
108 101
109static inline __u64 nilfs_mdt_cno(struct inode *inode) 102static inline __u64 nilfs_mdt_cno(struct inode *inode)
110{ 103{
111 return NILFS_I_NILFS(inode)->ns_cno; 104 return ((struct the_nilfs *)inode->i_sb->s_fs_info)->ns_cno;
112} 105}
113 106
114#define nilfs_mdt_bgl_lock(inode, bg) \ 107#define nilfs_mdt_bgl_lock(inode, bg) \
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 546849b3e88f..1102a5fbb744 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -334,6 +334,8 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
334 struct nilfs_transaction_info ti; 334 struct nilfs_transaction_info ti;
335 int err; 335 int err;
336 336
337 dentry_unhash(dentry);
338
337 err = nilfs_transaction_begin(dir->i_sb, &ti, 0); 339 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
338 if (err) 340 if (err)
339 return err; 341 return err;
@@ -369,6 +371,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
369 struct nilfs_transaction_info ti; 371 struct nilfs_transaction_info ti;
370 int err; 372 int err;
371 373
374 if (new_inode && S_ISDIR(new_inode->i_mode))
375 dentry_unhash(new_dentry);
376
372 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); 377 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
373 if (unlikely(err)) 378 if (unlikely(err))
374 return err; 379 return err;
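
The dentry_unhash() calls added here track a VFS change from this cycle: the unconditional unhash was pushed out of the generic vfs_rmdir()/vfs_rename() paths and into the filesystems that still depend on it, which is why nilfs_rename() only unhashes when the target of the rename is an existing directory that may be overwritten.
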
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index a8dd344303cb..a9c6a531f80c 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -80,12 +80,6 @@ static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
80 return &ii->vfs_inode; 80 return &ii->vfs_inode;
81} 81}
82 82
83static inline struct inode *NILFS_AS_I(struct address_space *mapping)
84{
85 return (mapping->host) ? :
86 container_of(mapping, struct inode, i_data);
87}
88
89/* 83/*
90 * Dynamic state flags of NILFS on-memory inode (i_state) 84 * Dynamic state flags of NILFS on-memory inode (i_state)
91 */ 85 */
@@ -298,6 +292,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
298 int flip); 292 int flip);
299int nilfs_commit_super(struct super_block *sb, int flag); 293int nilfs_commit_super(struct super_block *sb, int flag);
300int nilfs_cleanup_super(struct super_block *sb); 294int nilfs_cleanup_super(struct super_block *sb);
295int nilfs_resize_fs(struct super_block *sb, __u64 newsize);
301int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, 296int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
302 struct nilfs_root **root); 297 struct nilfs_root **root);
303int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno); 298int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 1168059c7efd..65221a04c6f0 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -37,8 +37,7 @@
37 37
38#define NILFS_BUFFER_INHERENT_BITS \ 38#define NILFS_BUFFER_INHERENT_BITS \
39 ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \ 39 ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
40 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated) | \ 40 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Checked))
41 (1UL << BH_NILFS_Checked))
42 41
43static struct buffer_head * 42static struct buffer_head *
44__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, 43__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
@@ -59,19 +58,6 @@ __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
59 return bh; 58 return bh;
60} 59}
61 60
62/*
63 * Since the page cache of B-tree node pages or data page cache of pseudo
64 * inodes does not have a valid mapping->host pointer, calling
65 * mark_buffer_dirty() for their buffers causes a NULL pointer dereference;
66 * it calls __mark_inode_dirty(NULL) through __set_page_dirty().
67 * To avoid this problem, the old style mark_buffer_dirty() is used instead.
68 */
69void nilfs_mark_buffer_dirty(struct buffer_head *bh)
70{
71 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
72 __set_page_dirty_nobuffers(bh->b_page);
73}
74
75struct buffer_head *nilfs_grab_buffer(struct inode *inode, 61struct buffer_head *nilfs_grab_buffer(struct inode *inode,
76 struct address_space *mapping, 62 struct address_space *mapping,
77 unsigned long blkoff, 63 unsigned long blkoff,
@@ -183,7 +169,7 @@ int nilfs_page_buffers_clean(struct page *page)
183void nilfs_page_bug(struct page *page) 169void nilfs_page_bug(struct page *page)
184{ 170{
185 struct address_space *m; 171 struct address_space *m;
186 unsigned long ino = 0; 172 unsigned long ino;
187 173
188 if (unlikely(!page)) { 174 if (unlikely(!page)) {
189 printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n"); 175 printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n");
@@ -191,11 +177,8 @@ void nilfs_page_bug(struct page *page)
191 } 177 }
192 178
193 m = page->mapping; 179 m = page->mapping;
194 if (m) { 180 ino = m ? m->host->i_ino : 0;
195 struct inode *inode = NILFS_AS_I(m); 181
196 if (inode != NULL)
197 ino = inode->i_ino;
198 }
199 printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx " 182 printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
200 "mapping=%p ino=%lu\n", 183 "mapping=%p ino=%lu\n",
201 page, atomic_read(&page->_count), 184 page, atomic_read(&page->_count),
@@ -217,56 +200,6 @@ void nilfs_page_bug(struct page *page)
217} 200}
218 201
219/** 202/**
220 * nilfs_alloc_private_page - allocate a private page with buffer heads
221 *
222 * Return Value: On success, a pointer to the allocated page is returned.
223 * On error, NULL is returned.
224 */
225struct page *nilfs_alloc_private_page(struct block_device *bdev, int size,
226 unsigned long state)
227{
228 struct buffer_head *bh, *head, *tail;
229 struct page *page;
230
231 page = alloc_page(GFP_NOFS); /* page_count of the returned page is 1 */
232 if (unlikely(!page))
233 return NULL;
234
235 lock_page(page);
236 head = alloc_page_buffers(page, size, 0);
237 if (unlikely(!head)) {
238 unlock_page(page);
239 __free_page(page);
240 return NULL;
241 }
242
243 bh = head;
244 do {
245 bh->b_state = (1UL << BH_NILFS_Allocated) | state;
246 tail = bh;
247 bh->b_bdev = bdev;
248 bh = bh->b_this_page;
249 } while (bh);
250
251 tail->b_this_page = head;
252 attach_page_buffers(page, head);
253
254 return page;
255}
256
257void nilfs_free_private_page(struct page *page)
258{
259 BUG_ON(!PageLocked(page));
260 BUG_ON(page->mapping);
261
262 if (page_has_buffers(page) && !try_to_free_buffers(page))
263 NILFS_PAGE_BUG(page, "failed to free page");
264
265 unlock_page(page);
266 __free_page(page);
267}
268
269/**
270 * nilfs_copy_page -- copy the page with buffers 203 * nilfs_copy_page -- copy the page with buffers
271 * @dst: destination page 204 * @dst: destination page
272 * @src: source page 205 * @src: source page
@@ -492,10 +425,10 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
492 return nc; 425 return nc;
493} 426}
494 427
495void nilfs_mapping_init(struct address_space *mapping, 428void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
496 struct backing_dev_info *bdi) 429 struct backing_dev_info *bdi)
497{ 430{
498 mapping->host = NULL; 431 mapping->host = inode;
499 mapping->flags = 0; 432 mapping->flags = 0;
500 mapping_set_gfp_mask(mapping, GFP_NOFS); 433 mapping_set_gfp_mask(mapping, GFP_NOFS);
501 mapping->assoc_mapping = NULL; 434 mapping->assoc_mapping = NULL;
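
Giving every private mapping a real ->host inode is what makes the removal of nilfs_mark_buffer_dirty() safe. The generic helper, abridged from fs/buffer.c of this era (a sketch, not the verbatim source):

	void mark_buffer_dirty(struct buffer_head *bh)
	{
		if (!test_set_buffer_dirty(bh)) {
			struct page *page = bh->b_page;

			if (!TestSetPageDirty(page)) {
				struct address_space *mapping = page_mapping(page);

				/* Ends in __mark_inode_dirty(mapping->host, ...);
				 * the old NULL ->host of the btnode/gc caches made
				 * that oops, hence the private helper removed above.
				 */
				if (mapping)
					__set_page_dirty(page, mapping, 0);
			}
		}
	}

With ->host valid everywhere, the generic path correctly dirties the buffer, the page and the owning inode, and the workaround can go.
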
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index f06b79ad7493..fb7de71605a0 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -38,14 +38,12 @@ enum {
38 BH_NILFS_Redirected, 38 BH_NILFS_Redirected,
39}; 39};
40 40
41BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */
42BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */ 41BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */
43BUFFER_FNS(NILFS_Volatile, nilfs_volatile) 42BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
44BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */ 43BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */
45BUFFER_FNS(NILFS_Redirected, nilfs_redirected) /* redirected to a copy */ 44BUFFER_FNS(NILFS_Redirected, nilfs_redirected) /* redirected to a copy */
46 45
47 46
48void nilfs_mark_buffer_dirty(struct buffer_head *bh);
49int __nilfs_clear_page_dirty(struct page *); 47int __nilfs_clear_page_dirty(struct page *);
50 48
51struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *, 49struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *,
@@ -54,14 +52,11 @@ void nilfs_forget_buffer(struct buffer_head *);
54void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *); 52void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *);
55int nilfs_page_buffers_clean(struct page *); 53int nilfs_page_buffers_clean(struct page *);
56void nilfs_page_bug(struct page *); 54void nilfs_page_bug(struct page *);
57struct page *nilfs_alloc_private_page(struct block_device *, int,
58 unsigned long);
59void nilfs_free_private_page(struct page *);
60 55
61int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); 56int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
62void nilfs_copy_back_pages(struct address_space *, struct address_space *); 57void nilfs_copy_back_pages(struct address_space *, struct address_space *);
63void nilfs_clear_dirty_pages(struct address_space *); 58void nilfs_clear_dirty_pages(struct address_space *);
64void nilfs_mapping_init(struct address_space *mapping, 59void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
65 struct backing_dev_info *bdi); 60 struct backing_dev_info *bdi);
66unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); 61unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
67unsigned long nilfs_find_uncommitted_extent(struct inode *inode, 62unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index ba4a64518f38..a604ac0331b2 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -387,9 +387,9 @@ static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr,
387static void dispose_recovery_list(struct list_head *head) 387static void dispose_recovery_list(struct list_head *head)
388{ 388{
389 while (!list_empty(head)) { 389 while (!list_empty(head)) {
390 struct nilfs_recovery_block *rb 390 struct nilfs_recovery_block *rb;
391 = list_entry(head->next, 391
392 struct nilfs_recovery_block, list); 392 rb = list_first_entry(head, struct nilfs_recovery_block, list);
393 list_del(&rb->list); 393 list_del(&rb->list);
394 kfree(rb); 394 kfree(rb);
395 } 395 }
@@ -416,9 +416,9 @@ static int nilfs_segment_list_add(struct list_head *head, __u64 segnum)
416void nilfs_dispose_segment_list(struct list_head *head) 416void nilfs_dispose_segment_list(struct list_head *head)
417{ 417{
418 while (!list_empty(head)) { 418 while (!list_empty(head)) {
419 struct nilfs_segment_entry *ent 419 struct nilfs_segment_entry *ent;
420 = list_entry(head->next, 420
421 struct nilfs_segment_entry, list); 421 ent = list_first_entry(head, struct nilfs_segment_entry, list);
422 list_del(&ent->list); 422 list_del(&ent->list);
423 kfree(ent); 423 kfree(ent);
424 } 424 }
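
The list_first_entry() conversions above are pure sugar. From <linux/list.h> of this era:

	#define list_first_entry(ptr, type, member) \
		list_entry((ptr)->next, type, member)

so the rewritten statement is exactly the old list_entry(head->next, ...) expression, merely stating "take the first element" directly.
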
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 2853ff20f85a..850a7c0228fb 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -239,12 +239,15 @@ nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
239 u32 seed) 239 u32 seed)
240{ 240{
241 struct nilfs_super_root *raw_sr; 241 struct nilfs_super_root *raw_sr;
242 struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info;
243 unsigned srsize;
242 u32 crc; 244 u32 crc;
243 245
244 raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data; 246 raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
247 srsize = NILFS_SR_BYTES(nilfs->ns_inode_size);
245 crc = crc32_le(seed, 248 crc = crc32_le(seed,
246 (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum), 249 (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
247 NILFS_SR_BYTES - sizeof(raw_sr->sr_sum)); 250 srsize - sizeof(raw_sr->sr_sum));
248 raw_sr->sr_sum = cpu_to_le32(crc); 251 raw_sr->sr_sum = cpu_to_le32(crc);
249} 252}
250 253
@@ -254,18 +257,6 @@ static void nilfs_release_buffers(struct list_head *list)
254 257
255 list_for_each_entry_safe(bh, n, list, b_assoc_buffers) { 258 list_for_each_entry_safe(bh, n, list, b_assoc_buffers) {
256 list_del_init(&bh->b_assoc_buffers); 259 list_del_init(&bh->b_assoc_buffers);
257 if (buffer_nilfs_allocated(bh)) {
258 struct page *clone_page = bh->b_page;
259
260 /* remove clone page */
261 brelse(bh);
262 page_cache_release(clone_page); /* for each bh */
263 if (page_count(clone_page) <= 2) {
264 lock_page(clone_page);
265 nilfs_free_private_page(clone_page);
266 }
267 continue;
268 }
269 brelse(bh); 260 brelse(bh);
270 } 261 }
271} 262}
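
The CRC change follows from NILFS_SR_BYTES becoming a function of the on-disk inode size instead of a sizeof() constant. Assuming the macro layout introduced by this series, NILFS_SR_BYTES(isz) is the 16-byte super root header (sr_sum, sr_bytes, sr_flags, sr_nongc_ctime) plus three inode slots for the DAT, cpfile and sufile, i.e. 16 + 3 * isz. A worked example: with the default isz = 128 the checksummed span is 16 + 384 = 400 bytes, and in a 4096-byte block the memset added to nilfs_segctor_fill_in_super_root() below zeroes the remaining 3696 bytes, so the unchecksummed tail stays deterministic.
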
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index afe4f2183454..141646e88fb5 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -655,13 +655,10 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
655 if (unlikely(page->index > last)) 655 if (unlikely(page->index > last))
656 break; 656 break;
657 657
658 if (mapping->host) { 658 lock_page(page);
659 lock_page(page); 659 if (!page_has_buffers(page))
660 if (!page_has_buffers(page)) 660 create_empty_buffers(page, 1 << inode->i_blkbits, 0);
661 create_empty_buffers(page, 661 unlock_page(page);
662 1 << inode->i_blkbits, 0);
663 unlock_page(page);
664 }
665 662
666 bh = head = page_buffers(page); 663 bh = head = page_buffers(page);
667 do { 664 do {
@@ -809,7 +806,7 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
809 /* The following code is duplicated with cpfile. But, it is 806 /* The following code is duplicated with cpfile. But, it is
810 needed to collect the checkpoint even if it was not newly 807 needed to collect the checkpoint even if it was not newly
811 created */ 808 created */
812 nilfs_mdt_mark_buffer_dirty(bh_cp); 809 mark_buffer_dirty(bh_cp);
813 nilfs_mdt_mark_dirty(nilfs->ns_cpfile); 810 nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
814 nilfs_cpfile_put_checkpoint( 811 nilfs_cpfile_put_checkpoint(
815 nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); 812 nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
@@ -889,12 +886,14 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
889{ 886{
890 struct buffer_head *bh_sr; 887 struct buffer_head *bh_sr;
891 struct nilfs_super_root *raw_sr; 888 struct nilfs_super_root *raw_sr;
892 unsigned isz = nilfs->ns_inode_size; 889 unsigned isz, srsz;
893 890
894 bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; 891 bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
895 raw_sr = (struct nilfs_super_root *)bh_sr->b_data; 892 raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
893 isz = nilfs->ns_inode_size;
894 srsz = NILFS_SR_BYTES(isz);
896 895
897 raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES); 896 raw_sr->sr_bytes = cpu_to_le16(srsz);
898 raw_sr->sr_nongc_ctime 897 raw_sr->sr_nongc_ctime
899 = cpu_to_le64(nilfs_doing_gc() ? 898 = cpu_to_le64(nilfs_doing_gc() ?
900 nilfs->ns_nongc_ctime : sci->sc_seg_ctime); 899 nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
@@ -906,6 +905,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
906 NILFS_SR_CPFILE_OFFSET(isz), 1); 905 NILFS_SR_CPFILE_OFFSET(isz), 1);
907 nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr + 906 nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
908 NILFS_SR_SUFILE_OFFSET(isz), 1); 907 NILFS_SR_SUFILE_OFFSET(isz), 1);
908 memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz);
909} 909}
910 910
911static void nilfs_redirty_inodes(struct list_head *head) 911static void nilfs_redirty_inodes(struct list_head *head)
@@ -954,8 +954,8 @@ static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci,
954 954
955 dispose_buffers: 955 dispose_buffers:
956 while (!list_empty(listp)) { 956 while (!list_empty(listp)) {
957 bh = list_entry(listp->next, struct buffer_head, 957 bh = list_first_entry(listp, struct buffer_head,
958 b_assoc_buffers); 958 b_assoc_buffers);
959 list_del_init(&bh->b_assoc_buffers); 959 list_del_init(&bh->b_assoc_buffers);
960 brelse(bh); 960 brelse(bh);
961 } 961 }
@@ -1500,10 +1500,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
1500 nblocks = le32_to_cpu(finfo->fi_nblocks); 1500 nblocks = le32_to_cpu(finfo->fi_nblocks);
1501 ndatablk = le32_to_cpu(finfo->fi_ndatablk); 1501 ndatablk = le32_to_cpu(finfo->fi_ndatablk);
1502 1502
1503 if (buffer_nilfs_node(bh)) 1503 inode = bh->b_page->mapping->host;
1504 inode = NILFS_BTNC_I(bh->b_page->mapping);
1505 else
1506 inode = NILFS_AS_I(bh->b_page->mapping);
1507 1504
1508 if (mode == SC_LSEG_DSYNC) 1505 if (mode == SC_LSEG_DSYNC)
1509 sc_op = &nilfs_sc_dsync_ops; 1506 sc_op = &nilfs_sc_dsync_ops;
@@ -1556,83 +1553,24 @@ static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
1556 return 0; 1553 return 0;
1557} 1554}
1558 1555
1559static int 1556static void nilfs_begin_page_io(struct page *page)
1560nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
1561{
1562 struct page *clone_page;
1563 struct buffer_head *bh, *head, *bh2;
1564 void *kaddr;
1565
1566 bh = head = page_buffers(page);
1567
1568 clone_page = nilfs_alloc_private_page(bh->b_bdev, bh->b_size, 0);
1569 if (unlikely(!clone_page))
1570 return -ENOMEM;
1571
1572 bh2 = page_buffers(clone_page);
1573 kaddr = kmap_atomic(page, KM_USER0);
1574 do {
1575 if (list_empty(&bh->b_assoc_buffers))
1576 continue;
1577 get_bh(bh2);
1578 page_cache_get(clone_page); /* for each bh */
1579 memcpy(bh2->b_data, kaddr + bh_offset(bh), bh2->b_size);
1580 bh2->b_blocknr = bh->b_blocknr;
1581 list_replace(&bh->b_assoc_buffers, &bh2->b_assoc_buffers);
1582 list_add_tail(&bh->b_assoc_buffers, out);
1583 } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
1584 kunmap_atomic(kaddr, KM_USER0);
1585
1586 if (!TestSetPageWriteback(clone_page))
1587 account_page_writeback(clone_page);
1588 unlock_page(clone_page);
1589
1590 return 0;
1591}
1592
1593static int nilfs_test_page_to_be_frozen(struct page *page)
1594{
1595 struct address_space *mapping = page->mapping;
1596
1597 if (!mapping || !mapping->host || S_ISDIR(mapping->host->i_mode))
1598 return 0;
1599
1600 if (page_mapped(page)) {
1601 ClearPageChecked(page);
1602 return 1;
1603 }
1604 return PageChecked(page);
1605}
1606
1607static int nilfs_begin_page_io(struct page *page, struct list_head *out)
1608{ 1557{
1609 if (!page || PageWriteback(page)) 1558 if (!page || PageWriteback(page))
1610 /* For split b-tree node pages, this function may be called 1559 /* For split b-tree node pages, this function may be called
1611 twice. We ignore the 2nd or later calls by this check. */ 1560 twice. We ignore the 2nd or later calls by this check. */
1612 return 0; 1561 return;
1613 1562
1614 lock_page(page); 1563 lock_page(page);
1615 clear_page_dirty_for_io(page); 1564 clear_page_dirty_for_io(page);
1616 set_page_writeback(page); 1565 set_page_writeback(page);
1617 unlock_page(page); 1566 unlock_page(page);
1618
1619 if (nilfs_test_page_to_be_frozen(page)) {
1620 int err = nilfs_copy_replace_page_buffers(page, out);
1621 if (unlikely(err))
1622 return err;
1623 }
1624 return 0;
1625} 1567}
1626 1568
1627static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci, 1569static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
1628 struct page **failed_page)
1629{ 1570{
1630 struct nilfs_segment_buffer *segbuf; 1571 struct nilfs_segment_buffer *segbuf;
1631 struct page *bd_page = NULL, *fs_page = NULL; 1572 struct page *bd_page = NULL, *fs_page = NULL;
1632 struct list_head *list = &sci->sc_copied_buffers;
1633 int err;
1634 1573
1635 *failed_page = NULL;
1636 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { 1574 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1637 struct buffer_head *bh; 1575 struct buffer_head *bh;
1638 1576
@@ -1662,11 +1600,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1662 break; 1600 break;
1663 } 1601 }
1664 if (bh->b_page != fs_page) { 1602 if (bh->b_page != fs_page) {
1665 err = nilfs_begin_page_io(fs_page, list); 1603 nilfs_begin_page_io(fs_page);
1666 if (unlikely(err)) {
1667 *failed_page = fs_page;
1668 goto out;
1669 }
1670 fs_page = bh->b_page; 1604 fs_page = bh->b_page;
1671 } 1605 }
1672 } 1606 }
@@ -1677,11 +1611,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1677 set_page_writeback(bd_page); 1611 set_page_writeback(bd_page);
1678 unlock_page(bd_page); 1612 unlock_page(bd_page);
1679 } 1613 }
1680 err = nilfs_begin_page_io(fs_page, list); 1614 nilfs_begin_page_io(fs_page);
1681 if (unlikely(err))
1682 *failed_page = fs_page;
1683 out:
1684 return err;
1685} 1615}
1686 1616
1687static int nilfs_segctor_write(struct nilfs_sc_info *sci, 1617static int nilfs_segctor_write(struct nilfs_sc_info *sci,
@@ -1694,24 +1624,6 @@ static int nilfs_segctor_write(struct nilfs_sc_info *sci,
1694 return ret; 1624 return ret;
1695} 1625}
1696 1626
1697static void __nilfs_end_page_io(struct page *page, int err)
1698{
1699 if (!err) {
1700 if (!nilfs_page_buffers_clean(page))
1701 __set_page_dirty_nobuffers(page);
1702 ClearPageError(page);
1703 } else {
1704 __set_page_dirty_nobuffers(page);
1705 SetPageError(page);
1706 }
1707
1708 if (buffer_nilfs_allocated(page_buffers(page))) {
1709 if (TestClearPageWriteback(page))
1710 dec_zone_page_state(page, NR_WRITEBACK);
1711 } else
1712 end_page_writeback(page);
1713}
1714
1715static void nilfs_end_page_io(struct page *page, int err) 1627static void nilfs_end_page_io(struct page *page, int err)
1716{ 1628{
1717 if (!page) 1629 if (!page)
@@ -1738,40 +1650,19 @@ static void nilfs_end_page_io(struct page *page, int err)
1738 return; 1650 return;
1739 } 1651 }
1740 1652
1741 __nilfs_end_page_io(page, err); 1653 if (!err) {
1742} 1654 if (!nilfs_page_buffers_clean(page))
1743 1655 __set_page_dirty_nobuffers(page);
1744static void nilfs_clear_copied_buffers(struct list_head *list, int err) 1656 ClearPageError(page);
1745{ 1657 } else {
1746 struct buffer_head *bh, *head; 1658 __set_page_dirty_nobuffers(page);
1747 struct page *page; 1659 SetPageError(page);
1748
1749 while (!list_empty(list)) {
1750 bh = list_entry(list->next, struct buffer_head,
1751 b_assoc_buffers);
1752 page = bh->b_page;
1753 page_cache_get(page);
1754 head = bh = page_buffers(page);
1755 do {
1756 if (!list_empty(&bh->b_assoc_buffers)) {
1757 list_del_init(&bh->b_assoc_buffers);
1758 if (!err) {
1759 set_buffer_uptodate(bh);
1760 clear_buffer_dirty(bh);
1761 clear_buffer_delay(bh);
1762 clear_buffer_nilfs_volatile(bh);
1763 }
1764 brelse(bh); /* for b_assoc_buffers */
1765 }
1766 } while ((bh = bh->b_this_page) != head);
1767
1768 __nilfs_end_page_io(page, err);
1769 page_cache_release(page);
1770 } 1660 }
1661
1662 end_page_writeback(page);
1771} 1663}
1772 1664
1773static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page, 1665static void nilfs_abort_logs(struct list_head *logs, int err)
1774 int err)
1775{ 1666{
1776 struct nilfs_segment_buffer *segbuf; 1667 struct nilfs_segment_buffer *segbuf;
1777 struct page *bd_page = NULL, *fs_page = NULL; 1668 struct page *bd_page = NULL, *fs_page = NULL;
@@ -1801,8 +1692,6 @@ static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
1801 } 1692 }
1802 if (bh->b_page != fs_page) { 1693 if (bh->b_page != fs_page) {
1803 nilfs_end_page_io(fs_page, err); 1694 nilfs_end_page_io(fs_page, err);
1804 if (fs_page && fs_page == failed_page)
1805 return;
1806 fs_page = bh->b_page; 1695 fs_page = bh->b_page;
1807 } 1696 }
1808 } 1697 }
@@ -1821,12 +1710,11 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
1821 1710
1822 list_splice_tail_init(&sci->sc_write_logs, &logs); 1711 list_splice_tail_init(&sci->sc_write_logs, &logs);
1823 ret = nilfs_wait_on_logs(&logs); 1712 ret = nilfs_wait_on_logs(&logs);
1824 nilfs_abort_logs(&logs, NULL, ret ? : err); 1713 nilfs_abort_logs(&logs, ret ? : err);
1825 1714
1826 list_splice_tail_init(&sci->sc_segbufs, &logs); 1715 list_splice_tail_init(&sci->sc_segbufs, &logs);
1827 nilfs_cancel_segusage(&logs, nilfs->ns_sufile); 1716 nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
1828 nilfs_free_incomplete_logs(&logs, nilfs); 1717 nilfs_free_incomplete_logs(&logs, nilfs);
1829 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err);
1830 1718
1831 if (sci->sc_stage.flags & NILFS_CF_SUFREED) { 1719 if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
1832 ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile, 1720 ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
@@ -1920,8 +1808,6 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1920 1808
1921 nilfs_end_page_io(fs_page, 0); 1809 nilfs_end_page_io(fs_page, 0);
1922 1810
1923 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, 0);
1924
1925 nilfs_drop_collected_inodes(&sci->sc_dirty_files); 1811 nilfs_drop_collected_inodes(&sci->sc_dirty_files);
1926 1812
1927 if (nilfs_doing_gc()) 1813 if (nilfs_doing_gc())
@@ -1979,7 +1865,7 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
1979 "failed to get inode block.\n"); 1865 "failed to get inode block.\n");
1980 return err; 1866 return err;
1981 } 1867 }
1982 nilfs_mdt_mark_buffer_dirty(ibh); 1868 mark_buffer_dirty(ibh);
1983 nilfs_mdt_mark_dirty(ifile); 1869 nilfs_mdt_mark_dirty(ifile);
1984 spin_lock(&nilfs->ns_inode_lock); 1870 spin_lock(&nilfs->ns_inode_lock);
1985 if (likely(!ii->i_bh)) 1871 if (likely(!ii->i_bh))
@@ -1991,8 +1877,7 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
1991 1877
1992 clear_bit(NILFS_I_QUEUED, &ii->i_state); 1878 clear_bit(NILFS_I_QUEUED, &ii->i_state);
1993 set_bit(NILFS_I_BUSY, &ii->i_state); 1879 set_bit(NILFS_I_BUSY, &ii->i_state);
1994 list_del(&ii->i_dirty); 1880 list_move_tail(&ii->i_dirty, &sci->sc_dirty_files);
1995 list_add_tail(&ii->i_dirty, &sci->sc_dirty_files);
1996 } 1881 }
1997 spin_unlock(&nilfs->ns_inode_lock); 1882 spin_unlock(&nilfs->ns_inode_lock);
1998 1883
@@ -2014,8 +1899,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
2014 clear_bit(NILFS_I_BUSY, &ii->i_state); 1899 clear_bit(NILFS_I_BUSY, &ii->i_state);
2015 brelse(ii->i_bh); 1900 brelse(ii->i_bh);
2016 ii->i_bh = NULL; 1901 ii->i_bh = NULL;
2017 list_del(&ii->i_dirty); 1902 list_move_tail(&ii->i_dirty, &ti->ti_garbage);
2018 list_add_tail(&ii->i_dirty, &ti->ti_garbage);
2019 } 1903 }
2020 spin_unlock(&nilfs->ns_inode_lock); 1904 spin_unlock(&nilfs->ns_inode_lock);
2021} 1905}
@@ -2026,7 +1910,6 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
2026static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) 1910static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2027{ 1911{
2028 struct the_nilfs *nilfs = sci->sc_super->s_fs_info; 1912 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
2029 struct page *failed_page;
2030 int err; 1913 int err;
2031 1914
2032 sci->sc_stage.scnt = NILFS_ST_INIT; 1915 sci->sc_stage.scnt = NILFS_ST_INIT;
@@ -2081,11 +1964,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2081 nilfs_segctor_update_segusage(sci, nilfs->ns_sufile); 1964 nilfs_segctor_update_segusage(sci, nilfs->ns_sufile);
2082 1965
2083 /* Write partial segments */ 1966 /* Write partial segments */
2084 err = nilfs_segctor_prepare_write(sci, &failed_page); 1967 nilfs_segctor_prepare_write(sci);
2085 if (err) {
2086 nilfs_abort_logs(&sci->sc_segbufs, failed_page, err);
2087 goto failed_to_write;
2088 }
2089 1968
2090 nilfs_add_checksums_on_logs(&sci->sc_segbufs, 1969 nilfs_add_checksums_on_logs(&sci->sc_segbufs,
2091 nilfs->ns_crc_seed); 1970 nilfs->ns_crc_seed);
@@ -2687,7 +2566,6 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
2687 INIT_LIST_HEAD(&sci->sc_segbufs); 2566 INIT_LIST_HEAD(&sci->sc_segbufs);
2688 INIT_LIST_HEAD(&sci->sc_write_logs); 2567 INIT_LIST_HEAD(&sci->sc_write_logs);
2689 INIT_LIST_HEAD(&sci->sc_gc_inodes); 2568 INIT_LIST_HEAD(&sci->sc_gc_inodes);
2690 INIT_LIST_HEAD(&sci->sc_copied_buffers);
2691 init_timer(&sci->sc_timer); 2569 init_timer(&sci->sc_timer);
2692 2570
2693 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; 2571 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
@@ -2741,8 +2619,6 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2741 if (flag || !nilfs_segctor_confirm(sci)) 2619 if (flag || !nilfs_segctor_confirm(sci))
2742 nilfs_segctor_write_out(sci); 2620 nilfs_segctor_write_out(sci);
2743 2621
2744 WARN_ON(!list_empty(&sci->sc_copied_buffers));
2745
2746 if (!list_empty(&sci->sc_dirty_files)) { 2622 if (!list_empty(&sci->sc_dirty_files)) {
2747 nilfs_warning(sci->sc_super, __func__, 2623 nilfs_warning(sci->sc_super, __func__,
2748 "dirty file(s) after the final construction\n"); 2624 "dirty file(s) after the final construction\n");
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 6c02a86745fb..38a1d0013314 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -92,7 +92,6 @@ struct nilfs_segsum_pointer {
92 * @sc_nblk_inc: Block count of current generation 92 * @sc_nblk_inc: Block count of current generation
93 * @sc_dirty_files: List of files to be written 93 * @sc_dirty_files: List of files to be written
94 * @sc_gc_inodes: List of GC inodes having blocks to be written 94 * @sc_gc_inodes: List of GC inodes having blocks to be written
95 * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data
96 * @sc_freesegs: array of segment numbers to be freed 95 * @sc_freesegs: array of segment numbers to be freed
97 * @sc_nfreesegs: number of segments on @sc_freesegs 96 * @sc_nfreesegs: number of segments on @sc_freesegs
98 * @sc_dsync_inode: inode whose data pages are written for a sync operation 97 * @sc_dsync_inode: inode whose data pages are written for a sync operation
@@ -136,7 +135,6 @@ struct nilfs_sc_info {
136 135
137 struct list_head sc_dirty_files; 136 struct list_head sc_dirty_files;
138 struct list_head sc_gc_inodes; 137 struct list_head sc_gc_inodes;
139 struct list_head sc_copied_buffers;
140 138
141 __u64 *sc_freesegs; 139 __u64 *sc_freesegs;
142 size_t sc_nfreesegs; 140 size_t sc_nfreesegs;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 1d6f488ccae8..0a0aba617d8a 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -33,7 +33,9 @@
33 33
34struct nilfs_sufile_info { 34struct nilfs_sufile_info {
35 struct nilfs_mdt_info mi; 35 struct nilfs_mdt_info mi;
36 unsigned long ncleansegs; 36 unsigned long ncleansegs;/* number of clean segments */
37 __u64 allocmin; /* lower limit of allocatable segment range */
38 __u64 allocmax; /* upper limit of allocatable segment range */
37}; 39};
38 40
39static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile) 41static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile)
@@ -96,6 +98,13 @@ nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum,
96 create, NULL, bhp); 98 create, NULL, bhp);
97} 99}
98 100
101static int nilfs_sufile_delete_segment_usage_block(struct inode *sufile,
102 __u64 segnum)
103{
104 return nilfs_mdt_delete_block(sufile,
105 nilfs_sufile_get_blkoff(sufile, segnum));
106}
107
99static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, 108static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
100 u64 ncleanadd, u64 ndirtyadd) 109 u64 ncleanadd, u64 ndirtyadd)
101{ 110{
@@ -108,7 +117,7 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
108 le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); 117 le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd);
109 kunmap_atomic(kaddr, KM_USER0); 118 kunmap_atomic(kaddr, KM_USER0);
110 119
111 nilfs_mdt_mark_buffer_dirty(header_bh); 120 mark_buffer_dirty(header_bh);
112} 121}
113 122
114/** 123/**
@@ -248,6 +257,35 @@ int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
248} 257}
249 258
250/** 259/**
260 * nilfs_sufile_set_alloc_range - limit the range of segments to be allocated
261 * @sufile: inode of segment usage file
262 * @start: minimum segment number of allocatable region (inclusive)
263 * @end: maximum segment number of allocatable region (inclusive)
264 *
265 * Return Value: On success, 0 is returned. On error, one of the
266 * following negative error codes is returned.
267 *
268 * %-ERANGE - invalid segment region
269 */
270int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end)
271{
272 struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
273 __u64 nsegs;
274 int ret = -ERANGE;
275
276 down_write(&NILFS_MDT(sufile)->mi_sem);
277 nsegs = nilfs_sufile_get_nsegments(sufile);
278
279 if (start <= end && end < nsegs) {
280 sui->allocmin = start;
281 sui->allocmax = end;
282 ret = 0;
283 }
284 up_write(&NILFS_MDT(sufile)->mi_sem);
285 return ret;
286}
287
288/**
251 * nilfs_sufile_alloc - allocate a segment 289 * nilfs_sufile_alloc - allocate a segment
252 * @sufile: inode of segment usage file 290 * @sufile: inode of segment usage file
253 * @segnump: pointer to segment number 291 * @segnump: pointer to segment number
@@ -269,11 +307,12 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
269 struct buffer_head *header_bh, *su_bh; 307 struct buffer_head *header_bh, *su_bh;
270 struct nilfs_sufile_header *header; 308 struct nilfs_sufile_header *header;
271 struct nilfs_segment_usage *su; 309 struct nilfs_segment_usage *su;
310 struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
272 size_t susz = NILFS_MDT(sufile)->mi_entry_size; 311 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
273 __u64 segnum, maxsegnum, last_alloc; 312 __u64 segnum, maxsegnum, last_alloc;
274 void *kaddr; 313 void *kaddr;
275 unsigned long nsegments, ncleansegs, nsus; 314 unsigned long nsegments, ncleansegs, nsus, cnt;
276 int ret, i, j; 315 int ret, j;
277 316
278 down_write(&NILFS_MDT(sufile)->mi_sem); 317 down_write(&NILFS_MDT(sufile)->mi_sem);
279 318
@@ -287,13 +326,31 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
287 kunmap_atomic(kaddr, KM_USER0); 326 kunmap_atomic(kaddr, KM_USER0);
288 327
289 nsegments = nilfs_sufile_get_nsegments(sufile); 328 nsegments = nilfs_sufile_get_nsegments(sufile);
329 maxsegnum = sui->allocmax;
290 segnum = last_alloc + 1; 330 segnum = last_alloc + 1;
291 maxsegnum = nsegments - 1; 331 if (segnum < sui->allocmin || segnum > sui->allocmax)
292 for (i = 0; i < nsegments; i += nsus) { 332 segnum = sui->allocmin;
293 if (segnum >= nsegments) { 333
294 /* wrap around */ 334 for (cnt = 0; cnt < nsegments; cnt += nsus) {
295 segnum = 0; 335 if (segnum > maxsegnum) {
296 maxsegnum = last_alloc; 336 if (cnt < sui->allocmax - sui->allocmin + 1) {
337 /*
338 * Wrap around within the limited region;
339 * if allocation started from
340 * sui->allocmin, this never happens.
341 */
342 segnum = sui->allocmin;
343 maxsegnum = last_alloc;
344 } else if (segnum > sui->allocmin &&
345 sui->allocmax + 1 < nsegments) {
346 segnum = sui->allocmax + 1;
347 maxsegnum = nsegments - 1;
348 } else if (sui->allocmin > 0) {
349 segnum = 0;
350 maxsegnum = sui->allocmin - 1;
351 } else {
352 break; /* never happens */
353 }
297 } 354 }
298 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, 355 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
299 &su_bh); 356 &su_bh);
@@ -319,9 +376,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
319 header->sh_last_alloc = cpu_to_le64(segnum); 376 header->sh_last_alloc = cpu_to_le64(segnum);
320 kunmap_atomic(kaddr, KM_USER0); 377 kunmap_atomic(kaddr, KM_USER0);
321 378
322 NILFS_SUI(sufile)->ncleansegs--; 379 sui->ncleansegs--;
323 nilfs_mdt_mark_buffer_dirty(header_bh); 380 mark_buffer_dirty(header_bh);
324 nilfs_mdt_mark_buffer_dirty(su_bh); 381 mark_buffer_dirty(su_bh);
325 nilfs_mdt_mark_dirty(sufile); 382 nilfs_mdt_mark_dirty(sufile);
326 brelse(su_bh); 383 brelse(su_bh);
327 *segnump = segnum; 384 *segnump = segnum;
@@ -364,7 +421,7 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
364 nilfs_sufile_mod_counter(header_bh, -1, 1); 421 nilfs_sufile_mod_counter(header_bh, -1, 1);
365 NILFS_SUI(sufile)->ncleansegs--; 422 NILFS_SUI(sufile)->ncleansegs--;
366 423
367 nilfs_mdt_mark_buffer_dirty(su_bh); 424 mark_buffer_dirty(su_bh);
368 nilfs_mdt_mark_dirty(sufile); 425 nilfs_mdt_mark_dirty(sufile);
369} 426}
370 427
@@ -395,7 +452,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
395 nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); 452 nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
396 NILFS_SUI(sufile)->ncleansegs -= clean; 453 NILFS_SUI(sufile)->ncleansegs -= clean;
397 454
398 nilfs_mdt_mark_buffer_dirty(su_bh); 455 mark_buffer_dirty(su_bh);
399 nilfs_mdt_mark_dirty(sufile); 456 nilfs_mdt_mark_dirty(sufile);
400} 457}
401 458
@@ -421,7 +478,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
421 sudirty = nilfs_segment_usage_dirty(su); 478 sudirty = nilfs_segment_usage_dirty(su);
422 nilfs_segment_usage_set_clean(su); 479 nilfs_segment_usage_set_clean(su);
423 kunmap_atomic(kaddr, KM_USER0); 480 kunmap_atomic(kaddr, KM_USER0);
424 nilfs_mdt_mark_buffer_dirty(su_bh); 481 mark_buffer_dirty(su_bh);
425 482
426 nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); 483 nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
427 NILFS_SUI(sufile)->ncleansegs++; 484 NILFS_SUI(sufile)->ncleansegs++;
@@ -441,7 +498,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
441 498
442 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); 499 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
443 if (!ret) { 500 if (!ret) {
444 nilfs_mdt_mark_buffer_dirty(bh); 501 mark_buffer_dirty(bh);
445 nilfs_mdt_mark_dirty(sufile); 502 nilfs_mdt_mark_dirty(sufile);
446 brelse(bh); 503 brelse(bh);
447 } 504 }
@@ -476,7 +533,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
476 su->su_nblocks = cpu_to_le32(nblocks); 533 su->su_nblocks = cpu_to_le32(nblocks);
477 kunmap_atomic(kaddr, KM_USER0); 534 kunmap_atomic(kaddr, KM_USER0);
478 535
479 nilfs_mdt_mark_buffer_dirty(bh); 536 mark_buffer_dirty(bh);
480 nilfs_mdt_mark_dirty(sufile); 537 nilfs_mdt_mark_dirty(sufile);
481 brelse(bh); 538 brelse(bh);
482 539
@@ -505,7 +562,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
505{ 562{
506 struct buffer_head *header_bh; 563 struct buffer_head *header_bh;
507 struct nilfs_sufile_header *header; 564 struct nilfs_sufile_header *header;
508 struct the_nilfs *nilfs = NILFS_I_NILFS(sufile); 565 struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
509 void *kaddr; 566 void *kaddr;
510 int ret; 567 int ret;
511 568
@@ -555,11 +612,183 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
555 nilfs_sufile_mod_counter(header_bh, -1, 0); 612 nilfs_sufile_mod_counter(header_bh, -1, 0);
556 NILFS_SUI(sufile)->ncleansegs--; 613 NILFS_SUI(sufile)->ncleansegs--;
557 } 614 }
558 nilfs_mdt_mark_buffer_dirty(su_bh); 615 mark_buffer_dirty(su_bh);
559 nilfs_mdt_mark_dirty(sufile); 616 nilfs_mdt_mark_dirty(sufile);
560} 617}
561 618
562/** 619/**
620 * nilfs_sufile_truncate_range - truncate range of segment array
621 * @sufile: inode of segment usage file
622 * @start: start segment number (inclusive)
623 * @end: end segment number (inclusive)
624 *
625 * Return Value: On success, 0 is returned. On error, one of the
626 * following negative error codes is returned.
627 *
628 * %-EIO - I/O error.
629 *
630 * %-ENOMEM - Insufficient amount of memory available.
631 *
632 * %-EINVAL - Invalid number of segments specified
633 *
634 * %-EBUSY - Dirty or active segments are present in the range
635 */
636static int nilfs_sufile_truncate_range(struct inode *sufile,
637 __u64 start, __u64 end)
638{
639 struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
640 struct buffer_head *header_bh;
641 struct buffer_head *su_bh;
642 struct nilfs_segment_usage *su, *su2;
643 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
644 unsigned long segusages_per_block;
645 unsigned long nsegs, ncleaned;
646 __u64 segnum;
647 void *kaddr;
648 ssize_t n, nc;
649 int ret;
650 int j;
651
652 nsegs = nilfs_sufile_get_nsegments(sufile);
653
654 ret = -EINVAL;
655 if (start > end || start >= nsegs)
656 goto out;
657
658 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
659 if (ret < 0)
660 goto out;
661
662 segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
663 ncleaned = 0;
664
665 for (segnum = start; segnum <= end; segnum += n) {
666 n = min_t(unsigned long,
667 segusages_per_block -
668 nilfs_sufile_get_offset(sufile, segnum),
669 end - segnum + 1);
670 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
671 &su_bh);
672 if (ret < 0) {
673 if (ret != -ENOENT)
674 goto out_header;
675 /* hole */
676 continue;
677 }
678 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
679 su = nilfs_sufile_block_get_segment_usage(
680 sufile, segnum, su_bh, kaddr);
681 su2 = su;
682 for (j = 0; j < n; j++, su = (void *)su + susz) {
683 if ((le32_to_cpu(su->su_flags) &
684 ~(1UL << NILFS_SEGMENT_USAGE_ERROR)) ||
685 nilfs_segment_is_active(nilfs, segnum + j)) {
686 ret = -EBUSY;
687 kunmap_atomic(kaddr, KM_USER0);
688 brelse(su_bh);
689 goto out_header;
690 }
691 }
692 nc = 0;
693 for (su = su2, j = 0; j < n; j++, su = (void *)su + susz) {
694 if (nilfs_segment_usage_error(su)) {
695 nilfs_segment_usage_set_clean(su);
696 nc++;
697 }
698 }
699 kunmap_atomic(kaddr, KM_USER0);
700 if (nc > 0) {
701 mark_buffer_dirty(su_bh);
702 ncleaned += nc;
703 }
704 brelse(su_bh);
705
706 if (n == segusages_per_block) {
707 /* make hole */
708 nilfs_sufile_delete_segment_usage_block(sufile, segnum);
709 }
710 }
711 ret = 0;
712
713out_header:
714 if (ncleaned > 0) {
715 NILFS_SUI(sufile)->ncleansegs += ncleaned;
716 nilfs_sufile_mod_counter(header_bh, ncleaned, 0);
717 nilfs_mdt_mark_dirty(sufile);
718 }
719 brelse(header_bh);
720out:
721 return ret;
722}
723
724/**
725 * nilfs_sufile_resize - resize segment array
726 * @sufile: inode of segment usage file
727 * @newnsegs: new number of segments
728 *
729 * Return Value: On success, 0 is returned. On error, one of the
730 * following negative error codes is returned.
731 *
732 * %-EIO - I/O error.
733 *
734 * %-ENOMEM - Insufficient amount of memory available.
735 *
736 * %-ENOSPC - Enough free space is not left for shrinking
737 *
738 * %-EBUSY - Dirty or active segments exist in the region to be truncated
739 */
740int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs)
741{
742 struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
743 struct buffer_head *header_bh;
744 struct nilfs_sufile_header *header;
745 struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
746 void *kaddr;
747 unsigned long nsegs, nrsvsegs;
748 int ret = 0;
749
750 down_write(&NILFS_MDT(sufile)->mi_sem);
751
752 nsegs = nilfs_sufile_get_nsegments(sufile);
753 if (nsegs == newnsegs)
754 goto out;
755
756 ret = -ENOSPC;
757 nrsvsegs = nilfs_nrsvsegs(nilfs, newnsegs);
758 if (newnsegs < nsegs && nsegs - newnsegs + nrsvsegs > sui->ncleansegs)
759 goto out;
760
761 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
762 if (ret < 0)
763 goto out;
764
765 if (newnsegs > nsegs) {
766 sui->ncleansegs += newnsegs - nsegs;
767 } else /* newnsegs < nsegs */ {
768 ret = nilfs_sufile_truncate_range(sufile, newnsegs, nsegs - 1);
769 if (ret < 0)
770 goto out_header;
771
772 sui->ncleansegs -= nsegs - newnsegs;
773 }
774
775 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
776 header = kaddr + bh_offset(header_bh);
777 header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs);
778 kunmap_atomic(kaddr, KM_USER0);
779
780 mark_buffer_dirty(header_bh);
781 nilfs_mdt_mark_dirty(sufile);
782 nilfs_set_nsegments(nilfs, newnsegs);
783
784out_header:
785 brelse(header_bh);
786out:
787 up_write(&NILFS_MDT(sufile)->mi_sem);
788 return ret;
789}
790
791/**
563 * nilfs_sufile_get_suinfo - 792 * nilfs_sufile_get_suinfo -
564 * @sufile: inode of segment usage file 793 * @sufile: inode of segment usage file
565 * @segnum: segment number to start looking 794 * @segnum: segment number to start looking
@@ -583,7 +812,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
583 struct nilfs_segment_usage *su; 812 struct nilfs_segment_usage *su;
584 struct nilfs_suinfo *si = buf; 813 struct nilfs_suinfo *si = buf;
585 size_t susz = NILFS_MDT(sufile)->mi_entry_size; 814 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
586 struct the_nilfs *nilfs = NILFS_I_NILFS(sufile); 815 struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
587 void *kaddr; 816 void *kaddr;
588 unsigned long nsegs, segusages_per_block; 817 unsigned long nsegs, segusages_per_block;
589 ssize_t n; 818 ssize_t n;
@@ -679,6 +908,9 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
679 kunmap_atomic(kaddr, KM_USER0); 908 kunmap_atomic(kaddr, KM_USER0);
680 brelse(header_bh); 909 brelse(header_bh);
681 910
911 sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1;
912 sui->allocmin = 0;
913
682 unlock_new_inode(sufile); 914 unlock_new_inode(sufile);
683 out: 915 out:
684 *inodep = sufile; 916 *inodep = sufile;
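
With allocmin/allocmax in place, nilfs_sufile_alloc() scans in a fixed window order. A worked example, assuming nsegments = 100, allocmin = 20, allocmax = 59 and last_alloc = 34: the search starts at segment 35 with maxsegnum = 59; if 35..59 are all in use it wraps inside the limited region to 20..34; only once the whole region has been scanned does it spill to the segments above allocmax (60..99), and segments below allocmin are tried only when no region above allocmax exists. nilfs_sufile_resize() builds on the same bookkeeping: shrinking fails with -ENOSPC unless the segments being cut off plus the new reserve, nilfs_nrsvsegs(nilfs, newnsegs), fit within the current clean-segment count, and nilfs_sufile_truncate_range() punches fully covered segment-usage blocks out as holes via nilfs_mdt_delete_block().
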
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index a943fbacb45b..e84bc5b51fc1 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -31,11 +31,12 @@
31 31
32static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) 32static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
33{ 33{
34 return NILFS_I_NILFS(sufile)->ns_nsegments; 34 return ((struct the_nilfs *)sufile->i_sb->s_fs_info)->ns_nsegments;
35} 35}
36 36
37unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile); 37unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
38 38
39int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end);
39int nilfs_sufile_alloc(struct inode *, __u64 *); 40int nilfs_sufile_alloc(struct inode *, __u64 *);
40int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum); 41int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum);
41int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, 42int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
@@ -61,6 +62,7 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
61void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, 62void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
62 struct buffer_head *); 63 struct buffer_head *);
63 64
65int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs);
64int nilfs_sufile_read(struct super_block *sb, size_t susize, 66int nilfs_sufile_read(struct super_block *sb, size_t susize,
65 struct nilfs_inode *raw_inode, struct inode **inodep); 67 struct nilfs_inode *raw_inode, struct inode **inodep);
66 68
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 062cca065195..8351c44a7320 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -56,6 +56,7 @@
56#include "btnode.h" 56#include "btnode.h"
57#include "page.h" 57#include "page.h"
58#include "cpfile.h" 58#include "cpfile.h"
59#include "sufile.h" /* nilfs_sufile_resize(), nilfs_sufile_set_alloc_range() */
59#include "ifile.h" 60#include "ifile.h"
60#include "dat.h" 61#include "dat.h"
61#include "segment.h" 62#include "segment.h"
@@ -165,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
165 ii->i_state = 0; 166 ii->i_state = 0;
166 ii->i_cno = 0; 167 ii->i_cno = 0;
167 ii->vfs_inode.i_version = 1; 168 ii->vfs_inode.i_version = 1;
168 nilfs_btnode_cache_init(&ii->i_btnode_cache, sb->s_bdi); 169 nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi);
169 return &ii->vfs_inode; 170 return &ii->vfs_inode;
170} 171}
171 172
@@ -347,6 +348,134 @@ int nilfs_cleanup_super(struct super_block *sb)
347 return ret; 348 return ret;
348} 349}
349 350
351/**
352 * nilfs_move_2nd_super - relocate secondary super block
353 * @sb: super block instance
354 * @sb2off: new offset of the secondary super block (in bytes)
355 */
356static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
357{
358 struct the_nilfs *nilfs = sb->s_fs_info;
359 struct buffer_head *nsbh;
360 struct nilfs_super_block *nsbp;
361 sector_t blocknr, newblocknr;
362 unsigned long offset;
363 int sb2i = -1; /* array index of the secondary superblock */
364 int ret = 0;
365
366 /* nilfs->ns_sem must be locked by the caller. */
367 if (nilfs->ns_sbh[1] &&
368 nilfs->ns_sbh[1]->b_blocknr > nilfs->ns_first_data_block) {
369 sb2i = 1;
370 blocknr = nilfs->ns_sbh[1]->b_blocknr;
371 } else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) {
372 sb2i = 0;
373 blocknr = nilfs->ns_sbh[0]->b_blocknr;
374 }
375 if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off)
376 goto out; /* super block location is unchanged */
377
378 /* Get new super block buffer */
379 newblocknr = sb2off >> nilfs->ns_blocksize_bits;
380 offset = sb2off & (nilfs->ns_blocksize - 1);
381 nsbh = sb_getblk(sb, newblocknr);
382 if (!nsbh) {
383 printk(KERN_WARNING
384 "NILFS warning: unable to move secondary superblock "
385 "to block %llu\n", (unsigned long long)newblocknr);
386 ret = -EIO;
387 goto out;
388 }
389 nsbp = (void *)nsbh->b_data + offset;
390 memset(nsbp, 0, nilfs->ns_blocksize);
391
392 if (sb2i >= 0) {
393 memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize);
394 brelse(nilfs->ns_sbh[sb2i]);
395 nilfs->ns_sbh[sb2i] = nsbh;
396 nilfs->ns_sbp[sb2i] = nsbp;
397 } else if (nilfs->ns_sbh[0]->b_blocknr < nilfs->ns_first_data_block) {
398 /* secondary super block will be restored to index 1 */
399 nilfs->ns_sbh[1] = nsbh;
400 nilfs->ns_sbp[1] = nsbp;
401 } else {
402 brelse(nsbh);
403 }
404out:
405 return ret;
406}
407
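For reference, the block/offset split computed in nilfs_move_2nd_super works out as follows; a worked sketch with illustrative values (the 4 KiB alignment of NILFS_SB2_OFFSET_BYTES is my assumption from nilfs2_fs.h, not something this hunk shows):

	/*
	 * sb2off = 0x3FFFF000 (4 KiB aligned), blocksize = 4096
	 * (ns_blocksize_bits = 12):
	 *   newblocknr = 0x3FFFF000 >> 12        = 0x3FFFF
	 *   offset     = 0x3FFFF000 & (4096 - 1) = 0
	 * With 1 KiB blocks (bits = 10) the same sb2off yields
	 * newblocknr = 0xFFFFC and again offset = 0.
	 */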
408/**
409 * nilfs_resize_fs - resize the filesystem
410 * @sb: super block instance
411 * @newsize: new size of the filesystem (in bytes)
412 */
413int nilfs_resize_fs(struct super_block *sb, __u64 newsize)
414{
415 struct the_nilfs *nilfs = sb->s_fs_info;
416 struct nilfs_super_block **sbp;
417 __u64 devsize, newnsegs;
418 loff_t sb2off;
419 int ret;
420
421 ret = -ERANGE;
422 devsize = i_size_read(sb->s_bdev->bd_inode);
423 if (newsize > devsize)
424 goto out;
425
426 /*
427 * Write lock is required to protect some functions depending
428 * on the number of segments, the number of reserved segments,
429 * and so forth.
430 */
431 down_write(&nilfs->ns_segctor_sem);
432
433 sb2off = NILFS_SB2_OFFSET_BYTES(newsize);
434 newnsegs = sb2off >> nilfs->ns_blocksize_bits;
435 do_div(newnsegs, nilfs->ns_blocks_per_segment);
436
437 ret = nilfs_sufile_resize(nilfs->ns_sufile, newnsegs);
438 up_write(&nilfs->ns_segctor_sem);
439 if (ret < 0)
440 goto out;
441
442 ret = nilfs_construct_segment(sb);
443 if (ret < 0)
444 goto out;
445
446 down_write(&nilfs->ns_sem);
447 nilfs_move_2nd_super(sb, sb2off);
448 ret = -EIO;
449 sbp = nilfs_prepare_super(sb, 0);
450 if (likely(sbp)) {
451 nilfs_set_log_cursor(sbp[0], nilfs);
452 /*
453 * Drop NILFS_RESIZE_FS flag for compatibility with
454 * mount-time resize which may be implemented in a
455 * future release.
456 */
457 sbp[0]->s_state = cpu_to_le16(le16_to_cpu(sbp[0]->s_state) &
458 ~NILFS_RESIZE_FS);
459 sbp[0]->s_dev_size = cpu_to_le64(newsize);
460 sbp[0]->s_nsegments = cpu_to_le64(nilfs->ns_nsegments);
461 if (sbp[1])
462 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
463 ret = nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL);
464 }
465 up_write(&nilfs->ns_sem);
466
467 /*
468 * Reset the range of allocatable segments last. This order
469 * is important in the case of expansion because the secondary
470 * superblock must be protected from log write until migration
471 * completes.
472 */
473 if (!ret)
474 nilfs_sufile_set_alloc_range(nilfs->ns_sufile, 0, newnsegs - 1);
475out:
476 return ret;
477}
478
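To see what nilfs_resize_fs does with the numbers, here is a worked example with made-up geometry (it assumes NILFS_SB2_OFFSET_BYTES(s) expands to (((s) >> 12) - 1) << 12, per nilfs2_fs.h):

	/*
	 * newsize = 1 GiB (0x40000000), blocksize = 4 KiB,
	 * ns_blocks_per_segment = 2048:
	 *   sb2off   = ((0x40000000 >> 12) - 1) << 12 = 0x3FFFF000
	 *   newnsegs = (0x3FFFF000 >> 12) / 2048      = 127
	 * Only the 127 segments lying wholly below the relocated
	 * secondary super block stay allocatable after the resize.
	 */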
350static void nilfs_put_super(struct super_block *sb) 479static void nilfs_put_super(struct super_block *sb)
351{ 480{
352 struct the_nilfs *nilfs = sb->s_fs_info; 481 struct the_nilfs *nilfs = sb->s_fs_info;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index d2acd1a651f3..d32714094375 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -363,6 +363,24 @@ static unsigned long long nilfs_max_size(unsigned int blkbits)
363 return res; 363 return res;
364} 364}
365 365
366/**
367 * nilfs_nrsvsegs - calculate the number of reserved segments
368 * @nilfs: nilfs object
369 * @nsegs: total number of segments
370 */
371unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs)
372{
373 return max_t(unsigned long, NILFS_MIN_NRSVSEGS,
374 DIV_ROUND_UP(nsegs * nilfs->ns_r_segments_percentage,
375 100));
376}
377
378void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs)
379{
380 nilfs->ns_nsegments = nsegs;
381 nilfs->ns_nrsvsegs = nilfs_nrsvsegs(nilfs, nsegs);
382}
383
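A quick arithmetic check of the reserve formula (assuming NILFS_MIN_NRSVSEGS is 8, its value in nilfs2_fs.h as far as I can tell):

	/*
	 * nsegs = 1000, ns_r_segments_percentage = 5:
	 *   max(8, DIV_ROUND_UP(1000 * 5, 100)) = max(8, 50) = 50
	 * nsegs = 100, same percentage:
	 *   max(8, DIV_ROUND_UP(100 * 5, 100))  = max(8, 5)  = 8
	 * so very small devices still keep at least 8 reserved segments.
	 */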
366static int nilfs_store_disk_layout(struct the_nilfs *nilfs, 384static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
367 struct nilfs_super_block *sbp) 385 struct nilfs_super_block *sbp)
368{ 386{
@@ -389,13 +407,9 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
389 } 407 }
390 408
391 nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block); 409 nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block);
392 nilfs->ns_nsegments = le64_to_cpu(sbp->s_nsegments);
393 nilfs->ns_r_segments_percentage = 410 nilfs->ns_r_segments_percentage =
394 le32_to_cpu(sbp->s_r_segments_percentage); 411 le32_to_cpu(sbp->s_r_segments_percentage);
395 nilfs->ns_nrsvsegs = 412 nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments));
396 max_t(unsigned long, NILFS_MIN_NRSVSEGS,
397 DIV_ROUND_UP(nilfs->ns_nsegments *
398 nilfs->ns_r_segments_percentage, 100));
399 nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed); 413 nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);
400 return 0; 414 return 0;
401} 415}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index f4968145c2a3..9992b11312ff 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -268,6 +268,8 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev);
268void destroy_nilfs(struct the_nilfs *nilfs); 268void destroy_nilfs(struct the_nilfs *nilfs);
269int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data); 269int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data);
270int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb); 270int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb);
271unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs);
272void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs);
271int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); 273int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
272int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 274int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
273struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno); 275struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno);
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index d8a0313e99e6..f17e58b32989 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -30,6 +30,7 @@ ocfs2-objs := \
30 namei.o \ 30 namei.o \
31 refcounttree.o \ 31 refcounttree.o \
32 reservations.o \ 32 reservations.o \
33 move_extents.o \
33 resize.o \ 34 resize.o \
34 slot_map.o \ 35 slot_map.o \
35 suballoc.o \ 36 suballoc.o \
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 48aa9c7401c7..ed553c60de82 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -29,6 +29,7 @@
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/swap.h> 30#include <linux/swap.h>
31#include <linux/quotaops.h> 31#include <linux/quotaops.h>
32#include <linux/blkdev.h>
32 33
33#include <cluster/masklog.h> 34#include <cluster/masklog.h>
34 35
@@ -7184,3 +7185,168 @@ out_commit:
7184out: 7185out:
7185 return ret; 7186 return ret;
7186} 7187}
7188
7189static int ocfs2_trim_extent(struct super_block *sb,
7190 struct ocfs2_group_desc *gd,
7191 u32 start, u32 count)
7192{
7193 u64 discard, bcount;
7194
7195 bcount = ocfs2_clusters_to_blocks(sb, count);
7196 discard = le64_to_cpu(gd->bg_blkno) +
7197 ocfs2_clusters_to_blocks(sb, start);
7198
7199 trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
7200
7201 return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
7202}
7203
7204static int ocfs2_trim_group(struct super_block *sb,
7205 struct ocfs2_group_desc *gd,
7206 u32 start, u32 max, u32 minbits)
7207{
7208 int ret = 0, count = 0, next;
7209 void *bitmap = gd->bg_bitmap;
7210
7211 if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
7212 return 0;
7213
7214 trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
7215 start, max, minbits);
7216
7217 while (start < max) {
7218 start = ocfs2_find_next_zero_bit(bitmap, max, start);
7219 if (start >= max)
7220 break;
7221 next = ocfs2_find_next_bit(bitmap, max, start);
7222
7223 if ((next - start) >= minbits) {
7224 ret = ocfs2_trim_extent(sb, gd,
7225 start, next - start);
7226 if (ret < 0) {
7227 mlog_errno(ret);
7228 break;
7229 }
7230 count += next - start;
7231 }
7232 start = next + 1;
7233
7234 if (fatal_signal_pending(current)) {
7235 count = -ERESTARTSYS;
7236 break;
7237 }
7238
7239 if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
7240 break;
7241 }
7242
7243 if (ret < 0)
7244 count = ret;
7245
7246 return count;
7247}
7248
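One hypothetical pass through ocfs2_trim_group, to make the zero-run scan concrete (bit pattern invented for illustration):

	/*
	 * minbits = 3, bits 0..11 of bg_bitmap = 1 1 0 0 0 0 1 1 1 0 0 1
	 * (1 = allocated, 0 = free):
	 *   find_next_zero_bit -> 2, find_next_bit -> 6:
	 *     run of 4 >= 3, extent discarded, count = 4
	 *   find_next_zero_bit -> 9, find_next_bit -> 11:
	 *     run of 2 < 3, skipped
	 * The function returns 4, the number of clusters trimmed.
	 */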
7249int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7250{
7251 struct ocfs2_super *osb = OCFS2_SB(sb);
7252 u64 start, len, trimmed, first_group, last_group, group;
7253 int ret, cnt;
7254 u32 first_bit, last_bit, minlen;
7255 struct buffer_head *main_bm_bh = NULL;
7256 struct inode *main_bm_inode = NULL;
7257 struct buffer_head *gd_bh = NULL;
7258 struct ocfs2_dinode *main_bm;
7259 struct ocfs2_group_desc *gd = NULL;
7260
7261 start = range->start >> osb->s_clustersize_bits;
7262 len = range->len >> osb->s_clustersize_bits;
7263 minlen = range->minlen >> osb->s_clustersize_bits;
7264 trimmed = 0;
7265
7266 if (!len) {
7267 range->len = 0;
7268 return 0;
7269 }
7270
7271 if (minlen >= osb->bitmap_cpg)
7272 return -EINVAL;
7273
7274 main_bm_inode = ocfs2_get_system_file_inode(osb,
7275 GLOBAL_BITMAP_SYSTEM_INODE,
7276 OCFS2_INVALID_SLOT);
7277 if (!main_bm_inode) {
7278 ret = -EIO;
7279 mlog_errno(ret);
7280 goto out;
7281 }
7282
7283 mutex_lock(&main_bm_inode->i_mutex);
7284
7285 ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
7286 if (ret < 0) {
7287 mlog_errno(ret);
7288 goto out_mutex;
7289 }
7290 main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
7291
7292 if (start >= le32_to_cpu(main_bm->i_clusters)) {
7293 ret = -EINVAL;
7294 goto out_unlock;
7295 }
7296
7297 if (start + len > le32_to_cpu(main_bm->i_clusters))
7298 len = le32_to_cpu(main_bm->i_clusters) - start;
7299
7300 trace_ocfs2_trim_fs(start, len, minlen);
7301
7302 /* Determine first and last group to examine based on start and len */
7303 first_group = ocfs2_which_cluster_group(main_bm_inode, start);
7304 if (first_group == osb->first_cluster_group_blkno)
7305 first_bit = start;
7306 else
7307 first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
7308 last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
7309 last_bit = osb->bitmap_cpg;
7310
7311 for (group = first_group; group <= last_group;) {
7312 if (first_bit + len >= osb->bitmap_cpg)
7313 last_bit = osb->bitmap_cpg;
7314 else
7315 last_bit = first_bit + len;
7316
7317 ret = ocfs2_read_group_descriptor(main_bm_inode,
7318 main_bm, group,
7319 &gd_bh);
7320 if (ret < 0) {
7321 mlog_errno(ret);
7322 break;
7323 }
7324
7325 gd = (struct ocfs2_group_desc *)gd_bh->b_data;
7326 cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
7327 brelse(gd_bh);
7328 gd_bh = NULL;
7329 if (cnt < 0) {
7330 ret = cnt;
7331 mlog_errno(ret);
7332 break;
7333 }
7334
7335 trimmed += cnt;
7336 len -= osb->bitmap_cpg - first_bit;
7337 first_bit = 0;
7338 if (group == osb->first_cluster_group_blkno)
7339 group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7340 else
7341 group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7342 }
7343 range->len = trimmed * sb->s_blocksize;
7344out_unlock:
7345 ocfs2_inode_unlock(main_bm_inode, 0);
7346 brelse(main_bm_bh);
7347out_mutex:
7348 mutex_unlock(&main_bm_inode->i_mutex);
7349 iput(main_bm_inode);
7350out:
7351 return ret;
7352}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3bd08a03251c..ca381c584127 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
239 struct buffer_head **leaf_bh); 239 struct buffer_head **leaf_bh);
240int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); 240int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
241 241
242int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
242/* 243/*
243 * Helper function to look at the # of clusters in an extent record. 244 * Helper function to look at the # of clusters in an extent record.
244 */ 245 */
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index bc702dab5d1f..a4b07730b2e1 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -57,7 +57,6 @@ static struct kset *o2cb_kset;
57void o2cb_sys_shutdown(void) 57void o2cb_sys_shutdown(void)
58{ 58{
59 mlog_sys_shutdown(); 59 mlog_sys_shutdown();
60 sysfs_remove_link(NULL, "o2cb");
61 kset_unregister(o2cb_kset); 60 kset_unregister(o2cb_kset);
62} 61}
63 62
@@ -69,14 +68,6 @@ int o2cb_sys_init(void)
69 if (!o2cb_kset) 68 if (!o2cb_kset)
70 return -ENOMEM; 69 return -ENOMEM;
71 70
72 /*
73 * Create this symlink for backwards compatibility with old
74 * versions of ocfs2-tools which look for things in /sys/o2cb.
75 */
76 ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
77 if (ret)
78 goto error;
79
80 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); 71 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
81 if (ret) 72 if (ret)
82 goto error; 73 goto error;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 4bdf7baee344..d602abb51b61 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -144,6 +144,7 @@ struct dlm_ctxt
144 wait_queue_head_t dlm_join_events; 144 wait_queue_head_t dlm_join_events;
145 unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 145 unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
146 unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 146 unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
147 unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
147 unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 148 unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
148 struct dlm_recovery_ctxt reco; 149 struct dlm_recovery_ctxt reco;
149 spinlock_t master_lock; 150 spinlock_t master_lock;
@@ -401,6 +402,18 @@ static inline int dlm_lvb_is_empty(char *lvb)
401 return 1; 402 return 1;
402} 403}
403 404
405static inline char *dlm_list_in_text(enum dlm_lockres_list idx)
406{
407 if (idx == DLM_GRANTED_LIST)
408 return "granted";
409 else if (idx == DLM_CONVERTING_LIST)
410 return "converting";
411 else if (idx == DLM_BLOCKED_LIST)
412 return "blocked";
413 else
414 return "unknown";
415}
416
404static inline struct list_head * 417static inline struct list_head *
405dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx) 418dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
406{ 419{
@@ -448,6 +461,7 @@ enum {
448 DLM_FINALIZE_RECO_MSG = 518, 461 DLM_FINALIZE_RECO_MSG = 518,
449 DLM_QUERY_REGION = 519, 462 DLM_QUERY_REGION = 519,
450 DLM_QUERY_NODEINFO = 520, 463 DLM_QUERY_NODEINFO = 520,
464 DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
451}; 465};
452 466
453struct dlm_reco_node_data 467struct dlm_reco_node_data
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 04a32be0aeb9..56f82cb912e3 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -756,6 +756,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
756 buf + out, len - out); 756 buf + out, len - out);
757 out += snprintf(buf + out, len - out, "\n"); 757 out += snprintf(buf + out, len - out, "\n");
758 758
759 /* Exit Domain Map: xx xx xx */
760 out += snprintf(buf + out, len - out, "Exit Domain Map: ");
761 out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES,
762 buf + out, len - out);
763 out += snprintf(buf + out, len - out, "\n");
764
759 /* Live Map: xx xx xx */ 765 /* Live Map: xx xx xx */
760 out += snprintf(buf + out, len - out, "Live Map: "); 766 out += snprintf(buf + out, len - out, "Live Map: ");
761 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, 767 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 3b179d6cbde0..6ed6b95dcf93 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -132,10 +132,12 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
132 * New in version 1.1: 132 * New in version 1.1:
133 * - Message DLM_QUERY_REGION added to support global heartbeat 133 * - Message DLM_QUERY_REGION added to support global heartbeat
134 * - Message DLM_QUERY_NODEINFO added to allow online node removes 134 * - Message DLM_QUERY_NODEINFO added to allow online node removes
135 * New in version 1.2:
136 * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
135 */ 137 */
136static const struct dlm_protocol_version dlm_protocol = { 138static const struct dlm_protocol_version dlm_protocol = {
137 .pv_major = 1, 139 .pv_major = 1,
138 .pv_minor = 1, 140 .pv_minor = 2,
139}; 141};
140 142
141#define DLM_DOMAIN_BACKOFF_MS 200 143#define DLM_DOMAIN_BACKOFF_MS 200
@@ -449,14 +451,18 @@ redo_bucket:
449 dropped = dlm_empty_lockres(dlm, res); 451 dropped = dlm_empty_lockres(dlm, res);
450 452
451 spin_lock(&res->spinlock); 453 spin_lock(&res->spinlock);
452 __dlm_lockres_calc_usage(dlm, res); 454 if (dropped)
453 iter = res->hash_node.next; 455 __dlm_lockres_calc_usage(dlm, res);
456 else
457 iter = res->hash_node.next;
454 spin_unlock(&res->spinlock); 458 spin_unlock(&res->spinlock);
455 459
456 dlm_lockres_put(res); 460 dlm_lockres_put(res);
457 461
458 if (dropped) 462 if (dropped) {
463 cond_resched_lock(&dlm->spinlock);
459 goto redo_bucket; 464 goto redo_bucket;
465 }
460 } 466 }
461 cond_resched_lock(&dlm->spinlock); 467 cond_resched_lock(&dlm->spinlock);
462 num += n; 468 num += n;
@@ -486,6 +492,28 @@ static int dlm_no_joining_node(struct dlm_ctxt *dlm)
486 return ret; 492 return ret;
487} 493}
488 494
495static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len,
496 void *data, void **ret_data)
497{
498 struct dlm_ctxt *dlm = data;
499 unsigned int node;
500 struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
501
502 if (!dlm_grab(dlm))
503 return 0;
504
505 node = exit_msg->node_idx;
506 mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node);
507
508 spin_lock(&dlm->spinlock);
509 set_bit(node, dlm->exit_domain_map);
510 spin_unlock(&dlm->spinlock);
511
512 dlm_put(dlm);
513
514 return 0;
515}
516
489static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm) 517static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
490{ 518{
491 /* Yikes, a double spinlock! I need domain_lock for the dlm 519 /* Yikes, a double spinlock! I need domain_lock for the dlm
@@ -542,6 +570,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
542 570
543 spin_lock(&dlm->spinlock); 571 spin_lock(&dlm->spinlock);
544 clear_bit(node, dlm->domain_map); 572 clear_bit(node, dlm->domain_map);
573 clear_bit(node, dlm->exit_domain_map);
545 __dlm_print_nodes(dlm); 574 __dlm_print_nodes(dlm);
546 575
547 /* notify anything attached to the heartbeat events */ 576 /* notify anything attached to the heartbeat events */
@@ -554,29 +583,56 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
554 return 0; 583 return 0;
555} 584}
556 585
557static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, 586static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type,
558 unsigned int node) 587 unsigned int node)
559{ 588{
560 int status; 589 int status;
561 struct dlm_exit_domain leave_msg; 590 struct dlm_exit_domain leave_msg;
562 591
563 mlog(0, "Asking node %u if we can leave the domain %s me = %u\n", 592 mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name,
564 node, dlm->name, dlm->node_num); 593 msg_type, node);
565 594
566 memset(&leave_msg, 0, sizeof(leave_msg)); 595 memset(&leave_msg, 0, sizeof(leave_msg));
567 leave_msg.node_idx = dlm->node_num; 596 leave_msg.node_idx = dlm->node_num;
568 597
569 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, 598 status = o2net_send_message(msg_type, dlm->key, &leave_msg,
570 &leave_msg, sizeof(leave_msg), node, 599 sizeof(leave_msg), node, NULL);
571 NULL);
572 if (status < 0) 600 if (status < 0)
573 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 601 mlog(ML_ERROR, "Error %d sending domain exit message %u "
574 "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node); 602 "to node %u on domain %s\n", status, msg_type, node,
575 mlog(0, "status return %d from o2net_send_message\n", status); 603 dlm->name);
576 604
577 return status; 605 return status;
578} 606}
579 607
608static void dlm_begin_exit_domain(struct dlm_ctxt *dlm)
609{
610 int node = -1;
611
612 /* Support for begin exit domain was added in 1.2 */
613 if (dlm->dlm_locking_proto.pv_major == 1 &&
614 dlm->dlm_locking_proto.pv_minor < 2)
615 return;
616
617 /*
618 * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely
619 * informational. Meaning if a node does not receive the message,
620 * so be it.
621 */
622 spin_lock(&dlm->spinlock);
623 while (1) {
624 node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1);
625 if (node >= O2NM_MAX_NODES)
626 break;
627 if (node == dlm->node_num)
628 continue;
629
630 spin_unlock(&dlm->spinlock);
631 dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node);
632 spin_lock(&dlm->spinlock);
633 }
634 spin_unlock(&dlm->spinlock);
635}
580 636
581static void dlm_leave_domain(struct dlm_ctxt *dlm) 637static void dlm_leave_domain(struct dlm_ctxt *dlm)
582{ 638{
@@ -602,7 +658,8 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
602 658
603 clear_node = 1; 659 clear_node = 1;
604 660
605 status = dlm_send_one_domain_exit(dlm, node); 661 status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG,
662 node);
606 if (status < 0 && 663 if (status < 0 &&
607 status != -ENOPROTOOPT && 664 status != -ENOPROTOOPT &&
608 status != -ENOTCONN) { 665 status != -ENOTCONN) {
@@ -677,6 +734,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
677 734
678 if (leave) { 735 if (leave) {
679 mlog(0, "shutting down domain %s\n", dlm->name); 736 mlog(0, "shutting down domain %s\n", dlm->name);
737 dlm_begin_exit_domain(dlm);
680 738
681 /* We changed dlm state, notify the thread */ 739 /* We changed dlm state, notify the thread */
682 dlm_kick_thread(dlm, NULL); 740 dlm_kick_thread(dlm, NULL);
@@ -909,6 +967,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
909 * leftover join state. */ 967 * leftover join state. */
910 BUG_ON(dlm->joining_node != assert->node_idx); 968 BUG_ON(dlm->joining_node != assert->node_idx);
911 set_bit(assert->node_idx, dlm->domain_map); 969 set_bit(assert->node_idx, dlm->domain_map);
970 clear_bit(assert->node_idx, dlm->exit_domain_map);
912 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 971 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
913 972
914 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", 973 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
@@ -1793,6 +1852,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1793 if (status) 1852 if (status)
1794 goto bail; 1853 goto bail;
1795 1854
1855 status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key,
1856 sizeof(struct dlm_exit_domain),
1857 dlm_begin_exit_domain_handler,
1858 dlm, NULL, &dlm->dlm_domain_handlers);
1859 if (status)
1860 goto bail;
1861
1796bail: 1862bail:
1797 if (status) 1863 if (status)
1798 dlm_unregister_domain_handlers(dlm); 1864 dlm_unregister_domain_handlers(dlm);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 84d166328cf7..11eefb8c12e9 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2339,65 +2339,55 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2339 dlm_lockres_put(res); 2339 dlm_lockres_put(res);
2340} 2340}
2341 2341
2342/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0 2342/*
2343 * if not. If 0, numlocks is set to the number of locks in the lockres. 2343 * A migrateable resource is one that:
 2344 * 1. is locally mastered,
 2345 * 2. has zero local locks, and
 2346 * 3. has one or more non-local locks, or one or more references.
2347 * Returns 1 if yes, 0 if not.
2344 */ 2348 */
2345static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, 2349static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2346 struct dlm_lock_resource *res, 2350 struct dlm_lock_resource *res)
2347 int *numlocks,
2348 int *hasrefs)
2349{ 2351{
2350 int ret; 2352 enum dlm_lockres_list idx;
2351 int i; 2353 int nonlocal = 0, node_ref;
2352 int count = 0;
2353 struct list_head *queue; 2354 struct list_head *queue;
2354 struct dlm_lock *lock; 2355 struct dlm_lock *lock;
2356 u64 cookie;
2355 2357
2356 assert_spin_locked(&res->spinlock); 2358 assert_spin_locked(&res->spinlock);
2357 2359
2358 *numlocks = 0; 2360 if (res->owner != dlm->node_num)
2359 *hasrefs = 0; 2361 return 0;
2360
2361 ret = -EINVAL;
2362 if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
2363 mlog(0, "cannot migrate lockres with unknown owner!\n");
2364 goto leave;
2365 }
2366
2367 if (res->owner != dlm->node_num) {
2368 mlog(0, "cannot migrate lockres this node doesn't own!\n");
2369 goto leave;
2370 }
2371 2362
2372 ret = 0; 2363 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2373 queue = &res->granted; 2364 queue = dlm_list_idx_to_ptr(res, idx);
2374 for (i = 0; i < 3; i++) {
2375 list_for_each_entry(lock, queue, list) { 2365 list_for_each_entry(lock, queue, list) {
2376 ++count; 2366 if (lock->ml.node != dlm->node_num) {
2377 if (lock->ml.node == dlm->node_num) { 2367 nonlocal++;
2378 mlog(0, "found a lock owned by this node still " 2368 continue;
2379 "on the %s queue! will not migrate this "
2380 "lockres\n", (i == 0 ? "granted" :
2381 (i == 1 ? "converting" :
2382 "blocked")));
2383 ret = -ENOTEMPTY;
2384 goto leave;
2385 } 2369 }
2370 cookie = be64_to_cpu(lock->ml.cookie);
2371 mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
2372 "%s list\n", dlm->name, res->lockname.len,
2373 res->lockname.name,
2374 dlm_get_lock_cookie_node(cookie),
2375 dlm_get_lock_cookie_seq(cookie),
2376 dlm_list_in_text(idx));
2377 return 0;
2386 } 2378 }
2387 queue++;
2388 } 2379 }
2389 2380
2390 *numlocks = count; 2381 if (!nonlocal) {
2391 2382 node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2392 count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 2383 if (node_ref >= O2NM_MAX_NODES)
2393 if (count < O2NM_MAX_NODES) 2384 return 0;
2394 *hasrefs = 1; 2385 }
2395 2386
2396 mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name, 2387 mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
2397 res->lockname.len, res->lockname.name, *numlocks, *hasrefs); 2388 res->lockname.name);
2398 2389
2399leave: 2390 return 1;
2400 return ret;
2401} 2391}
2402 2392
2403/* 2393/*
@@ -2406,8 +2396,7 @@ leave:
2406 2396
2407 2397
2408static int dlm_migrate_lockres(struct dlm_ctxt *dlm, 2398static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2409 struct dlm_lock_resource *res, 2399 struct dlm_lock_resource *res, u8 target)
2410 u8 target)
2411{ 2400{
2412 struct dlm_master_list_entry *mle = NULL; 2401 struct dlm_master_list_entry *mle = NULL;
2413 struct dlm_master_list_entry *oldmle = NULL; 2402 struct dlm_master_list_entry *oldmle = NULL;
@@ -2416,37 +2405,20 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2416 const char *name; 2405 const char *name;
2417 unsigned int namelen; 2406 unsigned int namelen;
2418 int mle_added = 0; 2407 int mle_added = 0;
2419 int numlocks, hasrefs;
2420 int wake = 0; 2408 int wake = 0;
2421 2409
2422 if (!dlm_grab(dlm)) 2410 if (!dlm_grab(dlm))
2423 return -EINVAL; 2411 return -EINVAL;
2424 2412
2413 BUG_ON(target == O2NM_MAX_NODES);
2414
2425 name = res->lockname.name; 2415 name = res->lockname.name;
2426 namelen = res->lockname.len; 2416 namelen = res->lockname.len;
2427 2417
2428 mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target); 2418 mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
2429 2419 target);
2430 /*
2431 * ensure this lockres is a proper candidate for migration
2432 */
2433 spin_lock(&res->spinlock);
2434 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2435 if (ret < 0) {
2436 spin_unlock(&res->spinlock);
2437 goto leave;
2438 }
2439 spin_unlock(&res->spinlock);
2440
2441 /* no work to do */
2442 if (numlocks == 0 && !hasrefs)
2443 goto leave;
2444
2445 /*
2446 * preallocate up front
2447 * if this fails, abort
2448 */
2449 2420
2421 /* preallocate up front. if this fails, abort */
2450 ret = -ENOMEM; 2422 ret = -ENOMEM;
2451 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); 2423 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
2452 if (!mres) { 2424 if (!mres) {
@@ -2462,35 +2434,10 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2462 ret = 0; 2434 ret = 0;
2463 2435
2464 /* 2436 /*
2465 * find a node to migrate the lockres to
2466 */
2467
2468 spin_lock(&dlm->spinlock);
2469 /* pick a new node */
2470 if (!test_bit(target, dlm->domain_map) ||
2471 target >= O2NM_MAX_NODES) {
2472 target = dlm_pick_migration_target(dlm, res);
2473 }
2474 mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
2475 namelen, name, target);
2476
2477 if (target >= O2NM_MAX_NODES ||
2478 !test_bit(target, dlm->domain_map)) {
2479 /* target chosen is not alive */
2480 ret = -EINVAL;
2481 }
2482
2483 if (ret) {
2484 spin_unlock(&dlm->spinlock);
2485 goto fail;
2486 }
2487
2488 mlog(0, "continuing with target = %u\n", target);
2489
2490 /*
2491 * clear any existing master requests and 2437 * clear any existing master requests and
2492 * add the migration mle to the list 2438 * add the migration mle to the list
2493 */ 2439 */
2440 spin_lock(&dlm->spinlock);
2494 spin_lock(&dlm->master_lock); 2441 spin_lock(&dlm->master_lock);
2495 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, 2442 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
2496 namelen, target, dlm->node_num); 2443 namelen, target, dlm->node_num);
@@ -2531,6 +2478,7 @@ fail:
2531 dlm_put_mle(mle); 2478 dlm_put_mle(mle);
2532 } else if (mle) { 2479 } else if (mle) {
2533 kmem_cache_free(dlm_mle_cache, mle); 2480 kmem_cache_free(dlm_mle_cache, mle);
2481 mle = NULL;
2534 } 2482 }
2535 goto leave; 2483 goto leave;
2536 } 2484 }
@@ -2652,69 +2600,52 @@ leave:
2652 if (wake) 2600 if (wake)
2653 wake_up(&res->wq); 2601 wake_up(&res->wq);
2654 2602
2655 /* TODO: cleanup */
2656 if (mres) 2603 if (mres)
2657 free_page((unsigned long)mres); 2604 free_page((unsigned long)mres);
2658 2605
2659 dlm_put(dlm); 2606 dlm_put(dlm);
2660 2607
2661 mlog(0, "returning %d\n", ret); 2608 mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
2609 name, target, ret);
2662 return ret; 2610 return ret;
2663} 2611}
2664 2612
2665#define DLM_MIGRATION_RETRY_MS 100 2613#define DLM_MIGRATION_RETRY_MS 100
2666 2614
2667/* Should be called only after beginning the domain leave process. 2615/*
2616 * Should be called only after beginning the domain leave process.
2668 * There should not be any remaining locks on nonlocal lock resources, 2617 * There should not be any remaining locks on nonlocal lock resources,
2669 * and there should be no local locks left on locally mastered resources. 2618 * and there should be no local locks left on locally mastered resources.
2670 * 2619 *
2671 * Called with the dlm spinlock held, may drop it to do migration, but 2620 * Called with the dlm spinlock held, may drop it to do migration, but
2672 * will re-acquire before exit. 2621 * will re-acquire before exit.
2673 * 2622 *
2674 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ 2623 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
2624 */
2675int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 2625int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2676{ 2626{
2677 int ret; 2627 int ret;
2678 int lock_dropped = 0; 2628 int lock_dropped = 0;
2679 int numlocks, hasrefs; 2629 u8 target = O2NM_MAX_NODES;
2630
2631 assert_spin_locked(&dlm->spinlock);
2680 2632
2681 spin_lock(&res->spinlock); 2633 spin_lock(&res->spinlock);
2682 if (res->owner != dlm->node_num) { 2634 if (dlm_is_lockres_migrateable(dlm, res))
2683 if (!__dlm_lockres_unused(res)) { 2635 target = dlm_pick_migration_target(dlm, res);
2684 mlog(ML_ERROR, "%s:%.*s: this node is not master, " 2636 spin_unlock(&res->spinlock);
2685 "trying to free this but locks remain\n",
2686 dlm->name, res->lockname.len, res->lockname.name);
2687 }
2688 spin_unlock(&res->spinlock);
2689 goto leave;
2690 }
2691 2637
2692 /* No need to migrate a lockres having no locks */ 2638 if (target == O2NM_MAX_NODES)
2693 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2694 if (ret >= 0 && numlocks == 0 && !hasrefs) {
2695 spin_unlock(&res->spinlock);
2696 goto leave; 2639 goto leave;
2697 }
2698 spin_unlock(&res->spinlock);
2699 2640
2700 /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ 2641 /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
2701 spin_unlock(&dlm->spinlock); 2642 spin_unlock(&dlm->spinlock);
2702 lock_dropped = 1; 2643 lock_dropped = 1;
2703 while (1) { 2644 ret = dlm_migrate_lockres(dlm, res, target);
2704 ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES); 2645 if (ret)
2705 if (ret >= 0) 2646 mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
2706 break; 2647 dlm->name, res->lockname.len, res->lockname.name,
2707 if (ret == -ENOTEMPTY) { 2648 target, ret);
2708 mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
2709 res->lockname.len, res->lockname.name);
2710 BUG();
2711 }
2712
2713 mlog(0, "lockres %.*s: migrate failed, "
2714 "retrying\n", res->lockname.len,
2715 res->lockname.name);
2716 msleep(DLM_MIGRATION_RETRY_MS);
2717 }
2718 spin_lock(&dlm->spinlock); 2649 spin_lock(&dlm->spinlock);
2719leave: 2650leave:
2720 return lock_dropped; 2651 return lock_dropped;
@@ -2898,61 +2829,55 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2898 } 2829 }
2899} 2830}
2900 2831
2901/* for now this is not too intelligent. we will 2832/*
2902 * need stats to make this do the right thing. 2833 * Pick a node to migrate the lock resource to. This function selects a
2903 * this just finds the first lock on one of the 2834 * potential target based first on the locks and then on the refmap. It skips
2904 * queues and uses that node as the target. */ 2835 * nodes that are in the process of exiting the domain.
2836 */
2905static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, 2837static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2906 struct dlm_lock_resource *res) 2838 struct dlm_lock_resource *res)
2907{ 2839{
2908 int i; 2840 enum dlm_lockres_list idx;
2909 struct list_head *queue = &res->granted; 2841 struct list_head *queue = &res->granted;
2910 struct dlm_lock *lock; 2842 struct dlm_lock *lock;
2911 int nodenum; 2843 int noderef;
2844 u8 nodenum = O2NM_MAX_NODES;
2912 2845
2913 assert_spin_locked(&dlm->spinlock); 2846 assert_spin_locked(&dlm->spinlock);
2847 assert_spin_locked(&res->spinlock);
2914 2848
2915 spin_lock(&res->spinlock); 2849 /* Go through all the locks */
2916 for (i=0; i<3; i++) { 2850 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2851 queue = dlm_list_idx_to_ptr(res, idx);
2917 list_for_each_entry(lock, queue, list) { 2852 list_for_each_entry(lock, queue, list) {
2918 /* up to the caller to make sure this node 2853 if (lock->ml.node == dlm->node_num)
2919 * is alive */ 2854 continue;
2920 if (lock->ml.node != dlm->node_num) { 2855 if (test_bit(lock->ml.node, dlm->exit_domain_map))
2921 spin_unlock(&res->spinlock); 2856 continue;
2922 return lock->ml.node; 2857 nodenum = lock->ml.node;
2923 } 2858 goto bail;
2924 } 2859 }
2925 queue++;
2926 }
2927
2928 nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2929 if (nodenum < O2NM_MAX_NODES) {
2930 spin_unlock(&res->spinlock);
2931 return nodenum;
2932 } 2860 }
2933 spin_unlock(&res->spinlock);
2934 mlog(0, "have not found a suitable target yet! checking domain map\n");
2935 2861
2936 /* ok now we're getting desperate. pick anyone alive. */ 2862 /* Go through the refmap */
2937 nodenum = -1; 2863 noderef = -1;
2938 while (1) { 2864 while (1) {
2939 nodenum = find_next_bit(dlm->domain_map, 2865 noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
2940 O2NM_MAX_NODES, nodenum+1); 2866 noderef + 1);
2941 mlog(0, "found %d in domain map\n", nodenum); 2867 if (noderef >= O2NM_MAX_NODES)
2942 if (nodenum >= O2NM_MAX_NODES)
2943 break; 2868 break;
2944 if (nodenum != dlm->node_num) { 2869 if (noderef == dlm->node_num)
2945 mlog(0, "picking %d\n", nodenum); 2870 continue;
2946 return nodenum; 2871 if (test_bit(noderef, dlm->exit_domain_map))
2947 } 2872 continue;
2873 nodenum = noderef;
2874 goto bail;
2948 } 2875 }
2949 2876
2950 mlog(0, "giving up. no master to migrate to\n"); 2877bail:
2951 return DLM_LOCK_RES_OWNER_UNKNOWN; 2878 return nodenum;
2952} 2879}
2953 2880
2954
2955
2956/* this is called by the new master once all lockres 2881/* this is called by the new master once all lockres
2957 * data has been received */ 2882 * data has been received */
2958static int dlm_do_migrate_request(struct dlm_ctxt *dlm, 2883static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f1beb6fc254d..7efab6d28a21 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2393,6 +2393,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
2393 2393
2394 mlog(0, "node %u being removed from domain map!\n", idx); 2394 mlog(0, "node %u being removed from domain map!\n", idx);
2395 clear_bit(idx, dlm->domain_map); 2395 clear_bit(idx, dlm->domain_map);
2396 clear_bit(idx, dlm->exit_domain_map);
2396 /* wake up migration waiters if a node goes down. 2397 /* wake up migration waiters if a node goes down.
2397 * perhaps later we can genericize this for other waiters. */ 2398 * perhaps later we can genericize this for other waiters. */
2398 wake_up(&dlm->migration_wq); 2399 wake_up(&dlm->migration_wq);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 8c5c0eddc365..b42076797049 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -88,7 +88,7 @@ struct workqueue_struct *user_dlm_worker;
88 * signifies a bast fired on the lock. 88 * signifies a bast fired on the lock.
89 */ 89 */
90#define DLMFS_CAPABILITIES "bast stackglue" 90#define DLMFS_CAPABILITIES "bast stackglue"
91extern int param_set_dlmfs_capabilities(const char *val, 91static int param_set_dlmfs_capabilities(const char *val,
92 struct kernel_param *kp) 92 struct kernel_param *kp)
93{ 93{
94 printk(KERN_ERR "%s: readonly parameter\n", kp->name); 94 printk(KERN_ERR "%s: readonly parameter\n", kp->name);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 89659d6dc206..b1e35a392ca5 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2670,6 +2670,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
2670 .flock = ocfs2_flock, 2670 .flock = ocfs2_flock,
2671 .splice_read = ocfs2_file_splice_read, 2671 .splice_read = ocfs2_file_splice_read,
2672 .splice_write = ocfs2_file_splice_write, 2672 .splice_write = ocfs2_file_splice_write,
2673 .fallocate = ocfs2_fallocate,
2673}; 2674};
2674 2675
2675const struct file_operations ocfs2_dops_no_plocks = { 2676const struct file_operations ocfs2_dops_no_plocks = {
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 8f13c5989eae..bc91072b7219 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -22,6 +22,11 @@
22#include "ioctl.h" 22#include "ioctl.h"
23#include "resize.h" 23#include "resize.h"
24#include "refcounttree.h" 24#include "refcounttree.h"
25#include "sysfile.h"
26#include "dir.h"
27#include "buffer_head_io.h"
28#include "suballoc.h"
29#include "move_extents.h"
25 30
26#include <linux/ext2_fs.h> 31#include <linux/ext2_fs.h>
27 32
@@ -35,31 +40,27 @@
35 * be -EFAULT. The error will be returned from the ioctl(2) call. It's 40 * be -EFAULT. The error will be returned from the ioctl(2) call. It's
36 * just a best-effort to tell userspace that this request caused the error. 41 * just a best-effort to tell userspace that this request caused the error.
37 */ 42 */
38static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq, 43static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
39 struct ocfs2_info_request __user *req) 44 struct ocfs2_info_request __user *req)
40{ 45{
41 kreq->ir_flags |= OCFS2_INFO_FL_ERROR; 46 kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
42 (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); 47 (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
43} 48}
44 49
45#define o2info_set_request_error(a, b) \ 50static inline void o2info_set_request_filled(struct ocfs2_info_request *req)
46 __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
47
48static inline void __o2info_set_request_filled(struct ocfs2_info_request *req)
49{ 51{
50 req->ir_flags |= OCFS2_INFO_FL_FILLED; 52 req->ir_flags |= OCFS2_INFO_FL_FILLED;
51} 53}
52 54
53#define o2info_set_request_filled(a) \ 55static inline void o2info_clear_request_filled(struct ocfs2_info_request *req)
54 __o2info_set_request_filled((struct ocfs2_info_request *)&(a))
55
56static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req)
57{ 56{
58 req->ir_flags &= ~OCFS2_INFO_FL_FILLED; 57 req->ir_flags &= ~OCFS2_INFO_FL_FILLED;
59} 58}
60 59
61#define o2info_clear_request_filled(a) \ 60static inline int o2info_coherent(struct ocfs2_info_request *req)
62 __o2info_clear_request_filled((struct ocfs2_info_request *)&(a)) 61{
62 return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
63}
63 64
64static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) 65static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
65{ 66{
@@ -153,7 +154,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
153 154
154 oib.ib_blocksize = inode->i_sb->s_blocksize; 155 oib.ib_blocksize = inode->i_sb->s_blocksize;
155 156
156 o2info_set_request_filled(oib); 157 o2info_set_request_filled(&oib.ib_req);
157 158
158 if (o2info_to_user(oib, req)) 159 if (o2info_to_user(oib, req))
159 goto bail; 160 goto bail;
@@ -161,7 +162,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
161 status = 0; 162 status = 0;
162bail: 163bail:
163 if (status) 164 if (status)
164 o2info_set_request_error(oib, req); 165 o2info_set_request_error(&oib.ib_req, req);
165 166
166 return status; 167 return status;
167} 168}
@@ -178,7 +179,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
178 179
179 oic.ic_clustersize = osb->s_clustersize; 180 oic.ic_clustersize = osb->s_clustersize;
180 181
181 o2info_set_request_filled(oic); 182 o2info_set_request_filled(&oic.ic_req);
182 183
183 if (o2info_to_user(oic, req)) 184 if (o2info_to_user(oic, req))
184 goto bail; 185 goto bail;
@@ -186,7 +187,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
186 status = 0; 187 status = 0;
187bail: 188bail:
188 if (status) 189 if (status)
189 o2info_set_request_error(oic, req); 190 o2info_set_request_error(&oic.ic_req, req);
190 191
191 return status; 192 return status;
192} 193}
@@ -203,7 +204,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
203 204
204 oim.im_max_slots = osb->max_slots; 205 oim.im_max_slots = osb->max_slots;
205 206
206 o2info_set_request_filled(oim); 207 o2info_set_request_filled(&oim.im_req);
207 208
208 if (o2info_to_user(oim, req)) 209 if (o2info_to_user(oim, req))
209 goto bail; 210 goto bail;
@@ -211,7 +212,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
211 status = 0; 212 status = 0;
212bail: 213bail:
213 if (status) 214 if (status)
214 o2info_set_request_error(oim, req); 215 o2info_set_request_error(&oim.im_req, req);
215 216
216 return status; 217 return status;
217} 218}
@@ -228,7 +229,7 @@ int ocfs2_info_handle_label(struct inode *inode,
228 229
229 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); 230 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
230 231
231 o2info_set_request_filled(oil); 232 o2info_set_request_filled(&oil.il_req);
232 233
233 if (o2info_to_user(oil, req)) 234 if (o2info_to_user(oil, req))
234 goto bail; 235 goto bail;
@@ -236,7 +237,7 @@ int ocfs2_info_handle_label(struct inode *inode,
236 status = 0; 237 status = 0;
237bail: 238bail:
238 if (status) 239 if (status)
239 o2info_set_request_error(oil, req); 240 o2info_set_request_error(&oil.il_req, req);
240 241
241 return status; 242 return status;
242} 243}
@@ -253,7 +254,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
253 254
254 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); 255 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
255 256
256 o2info_set_request_filled(oiu); 257 o2info_set_request_filled(&oiu.iu_req);
257 258
258 if (o2info_to_user(oiu, req)) 259 if (o2info_to_user(oiu, req))
259 goto bail; 260 goto bail;
@@ -261,7 +262,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
261 status = 0; 262 status = 0;
262bail: 263bail:
263 if (status) 264 if (status)
264 o2info_set_request_error(oiu, req); 265 o2info_set_request_error(&oiu.iu_req, req);
265 266
266 return status; 267 return status;
267} 268}
@@ -280,7 +281,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
280 oif.if_incompat_features = osb->s_feature_incompat; 281 oif.if_incompat_features = osb->s_feature_incompat;
281 oif.if_ro_compat_features = osb->s_feature_ro_compat; 282 oif.if_ro_compat_features = osb->s_feature_ro_compat;
282 283
283 o2info_set_request_filled(oif); 284 o2info_set_request_filled(&oif.if_req);
284 285
285 if (o2info_to_user(oif, req)) 286 if (o2info_to_user(oif, req))
286 goto bail; 287 goto bail;
@@ -288,7 +289,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
288 status = 0; 289 status = 0;
289bail: 290bail:
290 if (status) 291 if (status)
291 o2info_set_request_error(oif, req); 292 o2info_set_request_error(&oif.if_req, req);
292 293
293 return status; 294 return status;
294} 295}
@@ -305,7 +306,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
305 306
306 oij.ij_journal_size = osb->journal->j_inode->i_size; 307 oij.ij_journal_size = osb->journal->j_inode->i_size;
307 308
308 o2info_set_request_filled(oij); 309 o2info_set_request_filled(&oij.ij_req);
309 310
310 if (o2info_to_user(oij, req)) 311 if (o2info_to_user(oij, req))
311 goto bail; 312 goto bail;
@@ -313,7 +314,408 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
313 status = 0; 314 status = 0;
314bail: 315bail:
315 if (status) 316 if (status)
316 o2info_set_request_error(oij, req); 317 o2info_set_request_error(&oij.ij_req, req);
318
319 return status;
320}
321
322int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
323 struct inode *inode_alloc, u64 blkno,
324 struct ocfs2_info_freeinode *fi, u32 slot)
325{
326 int status = 0, unlock = 0;
327
328 struct buffer_head *bh = NULL;
329 struct ocfs2_dinode *dinode_alloc = NULL;
330
331 if (inode_alloc)
332 mutex_lock(&inode_alloc->i_mutex);
333
334 if (o2info_coherent(&fi->ifi_req)) {
335 status = ocfs2_inode_lock(inode_alloc, &bh, 0);
336 if (status < 0) {
337 mlog_errno(status);
338 goto bail;
339 }
340 unlock = 1;
341 } else {
342 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
343 if (status < 0) {
344 mlog_errno(status);
345 goto bail;
346 }
347 }
348
349 dinode_alloc = (struct ocfs2_dinode *)bh->b_data;
350
351 fi->ifi_stat[slot].lfi_total =
352 le32_to_cpu(dinode_alloc->id1.bitmap1.i_total);
353 fi->ifi_stat[slot].lfi_free =
354 le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) -
355 le32_to_cpu(dinode_alloc->id1.bitmap1.i_used);
356
357bail:
358 if (unlock)
359 ocfs2_inode_unlock(inode_alloc, 0);
360
361 if (inode_alloc)
362 mutex_unlock(&inode_alloc->i_mutex);
363
364 brelse(bh);
365
366 return status;
367}
368
369int ocfs2_info_handle_freeinode(struct inode *inode,
370 struct ocfs2_info_request __user *req)
371{
372 u32 i;
373 u64 blkno = -1;
374 char namebuf[40];
375 int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
376 struct ocfs2_info_freeinode *oifi = NULL;
377 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
378 struct inode *inode_alloc = NULL;
379
380 oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL);
381 if (!oifi) {
382 status = -ENOMEM;
383 mlog_errno(status);
384 goto bail;
385 }
386
387 if (o2info_from_user(*oifi, req))
388 goto bail;
389
390 oifi->ifi_slotnum = osb->max_slots;
391
392 for (i = 0; i < oifi->ifi_slotnum; i++) {
393 if (o2info_coherent(&oifi->ifi_req)) {
394 inode_alloc = ocfs2_get_system_file_inode(osb, type, i);
395 if (!inode_alloc) {
396 mlog(ML_ERROR, "unable to get alloc inode in "
397 "slot %u\n", i);
398 status = -EIO;
399 goto bail;
400 }
401 } else {
402 ocfs2_sprintf_system_inode_name(namebuf,
403 sizeof(namebuf),
404 type, i);
405 status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
406 namebuf,
407 strlen(namebuf),
408 &blkno);
409 if (status < 0) {
410 status = -ENOENT;
411 goto bail;
412 }
413 }
414
415 status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
416 if (status < 0)
417 goto bail;
418
419 iput(inode_alloc);
420 inode_alloc = NULL;
421 }
422
423 o2info_set_request_filled(&oifi->ifi_req);
424
425 if (o2info_to_user(*oifi, req))
426 goto bail;
427
428 status = 0;
429bail:
430 if (status)
431 o2info_set_request_error(&oifi->ifi_req, req);
432
433 kfree(oifi);
434
435 return status;
436}
437
438static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist,
439 unsigned int chunksize)
440{
441 int index;
442
443 index = __ilog2_u32(chunksize);
444 if (index >= OCFS2_INFO_MAX_HIST)
445 index = OCFS2_INFO_MAX_HIST - 1;
446
447 hist->fc_chunks[index]++;
448 hist->fc_clusters[index] += chunksize;
449}
450
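The histogram buckets free chunks by the floor of log2 of their size, for example:

	/*
	 * chunksize = 6  -> __ilog2_u32(6)  = 2, bucket 2 (runs of 4..7)
	 * chunksize = 64 -> __ilog2_u32(64) = 6, bucket 6 (runs of 64..127)
	 * Anything larger is clamped into the last bucket,
	 * OCFS2_INFO_MAX_HIST - 1.
	 */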
451static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
452 unsigned int chunksize)
453{
454 if (chunksize > stats->ffs_max)
455 stats->ffs_max = chunksize;
456
457 if (chunksize < stats->ffs_min)
458 stats->ffs_min = chunksize;
459
460 stats->ffs_avg += chunksize;
461 stats->ffs_free_chunks_real++;
462}
463
464void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
465 unsigned int chunksize)
466{
467 o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);
468 o2ffg_update_stats(&(ffg->iff_ffs), chunksize);
469}
470
471int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
472 struct inode *gb_inode,
473 struct ocfs2_dinode *gb_dinode,
474 struct ocfs2_chain_rec *rec,
475 struct ocfs2_info_freefrag *ffg,
476 u32 chunks_in_group)
477{
478 int status = 0, used;
479 u64 blkno;
480
481 struct buffer_head *bh = NULL;
482 struct ocfs2_group_desc *bg = NULL;
483
484 unsigned int max_bits, num_clusters;
485 unsigned int offset = 0, cluster, chunk;
486 unsigned int chunk_free, last_chunksize = 0;
487
488 if (!le32_to_cpu(rec->c_free))
489 goto bail;
490
491 do {
492 if (!bg)
493 blkno = le64_to_cpu(rec->c_blkno);
494 else
495 blkno = le64_to_cpu(bg->bg_next_group);
496
497 if (bh) {
498 brelse(bh);
499 bh = NULL;
500 }
501
502 if (o2info_coherent(&ffg->iff_req))
503 status = ocfs2_read_group_descriptor(gb_inode,
504 gb_dinode,
505 blkno, &bh);
506 else
507 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
508
509 if (status < 0) {
510 mlog(ML_ERROR, "Can't read the group descriptor # "
511 "%llu from device.", (unsigned long long)blkno);
512 status = -EIO;
513 goto bail;
514 }
515
516 bg = (struct ocfs2_group_desc *)bh->b_data;
517
518 if (!le16_to_cpu(bg->bg_free_bits_count))
519 continue;
520
521 max_bits = le16_to_cpu(bg->bg_bits);
522 offset = 0;
523
524 for (chunk = 0; chunk < chunks_in_group; chunk++) {
525 /*
526 * The last chunk may not be an entire one.
527 */
528 if ((offset + ffg->iff_chunksize) > max_bits)
529 num_clusters = max_bits - offset;
530 else
531 num_clusters = ffg->iff_chunksize;
532
533 chunk_free = 0;
534 for (cluster = 0; cluster < num_clusters; cluster++) {
535 used = ocfs2_test_bit(offset,
536 (unsigned long *)bg->bg_bitmap);
537 /*
538 * - chunk_free counts the free clusters in the current chunk.
539 * - last_chunksize records the size (in clusters)
540 * of the last real free chunk being counted.
541 */
542 if (!used) {
543 last_chunksize++;
544 chunk_free++;
545 }
546
547 if (used && last_chunksize) {
548 ocfs2_info_update_ffg(ffg,
549 last_chunksize);
550 last_chunksize = 0;
551 }
552
553 offset++;
554 }
555
556 if (chunk_free == ffg->iff_chunksize)
557 ffg->iff_ffs.ffs_free_chunks++;
558 }
559
560 /*
561 * Need to update the info for the last free chunk.
562 */
563 if (last_chunksize)
564 ocfs2_info_update_ffg(ffg, last_chunksize);
565
566 } while (le64_to_cpu(bg->bg_next_group));
567
568bail:
569 brelse(bh);
570
571 return status;
572}
573
574int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
575 struct inode *gb_inode, u64 blkno,
576 struct ocfs2_info_freefrag *ffg)
577{
578 u32 chunks_in_group;
579 int status = 0, unlock = 0, i;
580
581 struct buffer_head *bh = NULL;
582 struct ocfs2_chain_list *cl = NULL;
583 struct ocfs2_chain_rec *rec = NULL;
584 struct ocfs2_dinode *gb_dinode = NULL;
585
586 if (gb_inode)
587 mutex_lock(&gb_inode->i_mutex);
588
589 if (o2info_coherent(&ffg->iff_req)) {
590 status = ocfs2_inode_lock(gb_inode, &bh, 0);
591 if (status < 0) {
592 mlog_errno(status);
593 goto bail;
594 }
595 unlock = 1;
596 } else {
597 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
598 if (status < 0) {
599 mlog_errno(status);
600 goto bail;
601 }
602 }
603
604 gb_dinode = (struct ocfs2_dinode *)bh->b_data;
605 cl = &(gb_dinode->id2.i_chain);
606
607 /*
608 * The chunksize (in clusters) from userspace must not
609 * exceed the number of clusters in a group.
610 */
611 if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) {
612 status = -EINVAL;
613 goto bail;
614 }
615
616 memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats));
617
618 ffg->iff_ffs.ffs_min = ~0U;
619 ffg->iff_ffs.ffs_clusters =
620 le32_to_cpu(gb_dinode->id1.bitmap1.i_total);
621 ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters -
622 le32_to_cpu(gb_dinode->id1.bitmap1.i_used);
623
624 chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1;
625
626 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
627 rec = &(cl->cl_recs[i]);
628 status = ocfs2_info_freefrag_scan_chain(osb, gb_inode,
629 gb_dinode,
630 rec, ffg,
631 chunks_in_group);
632 if (status)
633 goto bail;
634 }
635
636 if (ffg->iff_ffs.ffs_free_chunks_real)
637 ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg /
638 ffg->iff_ffs.ffs_free_chunks_real);
639bail:
640 if (unlock)
641 ocfs2_inode_unlock(gb_inode, 0);
642
643 if (gb_inode)
644 mutex_unlock(&gb_inode->i_mutex);
645
646 if (gb_inode)
647 iput(gb_inode);
648
649 brelse(bh);
650
651 return status;
652}
653
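The + 1 in chunks_in_group above accounts for a trailing partial chunk; with illustrative numbers:

	/*
	 * cl_cpg = 2048 clusters per group, iff_chunksize = 300:
	 *   chunks_in_group = 2048 / 300 + 1 = 7
	 * i.e. six full 300-cluster chunks plus a 248-cluster tail that
	 * the chain scan shortens via num_clusters = max_bits - offset.
	 */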
654int ocfs2_info_handle_freefrag(struct inode *inode,
655 struct ocfs2_info_request __user *req)
656{
657 u64 blkno = -1;
658 char namebuf[40];
659 int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
660
661 struct ocfs2_info_freefrag *oiff;
662 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
663 struct inode *gb_inode = NULL;
664
665 oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL);
666 if (!oiff) {
667 status = -ENOMEM;
668 mlog_errno(status);
669 goto bail;
670 }
671
672 if (o2info_from_user(*oiff, req))
673 goto bail;
674 /*
675 * The chunksize from userspace must be a power of 2.
676 */
677 if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) ||
678 (!oiff->iff_chunksize)) {
679 status = -EINVAL;
680 goto bail;
681 }
682
683 if (o2info_coherent(&oiff->iff_req)) {
684 gb_inode = ocfs2_get_system_file_inode(osb, type,
685 OCFS2_INVALID_SLOT);
686 if (!gb_inode) {
687 mlog(ML_ERROR, "unable to get global_bitmap inode\n");
688 status = -EIO;
689 goto bail;
690 }
691 } else {
692 ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
693 OCFS2_INVALID_SLOT);
694 status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
695 namebuf,
696 strlen(namebuf),
697 &blkno);
698 if (status < 0) {
699 status = -ENOENT;
700 goto bail;
701 }
702 }
703
704 status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff);
705 if (status < 0)
706 goto bail;
707
708 o2info_set_request_filled(&oiff->iff_req);
709
710 if (o2info_to_user(*oiff, req))
711 goto bail;
712
713 status = 0;
714bail:
715 if (status)
716 o2info_set_request_error(&oiff->iff_req, req);
717
718 kfree(oiff);
317 719
318 return status; 720 return status;
319} 721}
@@ -327,7 +729,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
327 if (o2info_from_user(oir, req)) 729 if (o2info_from_user(oir, req))
328 goto bail; 730 goto bail;
329 731
330 o2info_clear_request_filled(oir); 732 o2info_clear_request_filled(&oir);
331 733
332 if (o2info_to_user(oir, req)) 734 if (o2info_to_user(oir, req))
333 goto bail; 735 goto bail;
@@ -335,7 +737,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
335 status = 0; 737 status = 0;
336bail: 738bail:
337 if (status) 739 if (status)
338 o2info_set_request_error(oir, req); 740 o2info_set_request_error(&oir, req);
339 741
340 return status; 742 return status;
341} 743}
@@ -389,6 +791,14 @@ int ocfs2_info_handle_request(struct inode *inode,
389 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) 791 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
390 status = ocfs2_info_handle_journal_size(inode, req); 792 status = ocfs2_info_handle_journal_size(inode, req);
391 break; 793 break;
794 case OCFS2_INFO_FREEINODE:
795 if (oir.ir_size == sizeof(struct ocfs2_info_freeinode))
796 status = ocfs2_info_handle_freeinode(inode, req);
797 break;
798 case OCFS2_INFO_FREEFRAG:
799 if (oir.ir_size == sizeof(struct ocfs2_info_freefrag))
800 status = ocfs2_info_handle_freefrag(inode, req);
801 break;
392 default: 802 default:
393 status = ocfs2_info_handle_unknown(inode, req); 803 status = ocfs2_info_handle_unknown(inode, req);
394 break; 804 break;
@@ -542,6 +952,31 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
542 return -EFAULT; 952 return -EFAULT;
543 953
544 return ocfs2_info_handle(inode, &info, 0); 954 return ocfs2_info_handle(inode, &info, 0);
955 case FITRIM:
956 {
957 struct super_block *sb = inode->i_sb;
958 struct fstrim_range range;
959 int ret = 0;
960
961 if (!capable(CAP_SYS_ADMIN))
962 return -EPERM;
963
964 if (copy_from_user(&range, (struct fstrim_range *)arg,
965 sizeof(range)))
966 return -EFAULT;
967
968 ret = ocfs2_trim_fs(sb, &range);
969 if (ret < 0)
970 return ret;
971
972 if (copy_to_user((struct fstrim_range *)arg, &range,
973 sizeof(range)))
974 return -EFAULT;
975
976 return 0;
977 }
978 case OCFS2_IOC_MOVE_EXT:
979 return ocfs2_ioctl_move_extents(filp, (void __user *)arg);
545 default: 980 default:
546 return -ENOTTY; 981 return -ENOTTY;
547 } 982 }
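
For reference, a minimal userspace sketch of exercising the new FITRIM case above (not part of this patch); it relies only on the standard FITRIM interface and struct fstrim_range from <linux/fs.h>, and the mount-point argument is illustrative:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* FITRIM, struct fstrim_range */

int main(int argc, char **argv)
{
        struct fstrim_range range;
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
                return 1;
        }

        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        memset(&range, 0, sizeof(range));
        range.len = (__u64)-1;  /* trim every free extent on the fs */

        /* dispatches to ocfs2_trim_fs(); requires CAP_SYS_ADMIN */
        if (ioctl(fd, FITRIM, &range) < 0) {
                perror("ioctl(FITRIM)");
                return 1;
        }

        /* the kernel writes back how many bytes were actually trimmed */
        printf("trimmed %llu bytes\n", (unsigned long long)range.len);
        return 0;
}
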
@@ -569,6 +1004,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
569 case OCFS2_IOC_GROUP_EXTEND: 1004 case OCFS2_IOC_GROUP_EXTEND:
570 case OCFS2_IOC_GROUP_ADD: 1005 case OCFS2_IOC_GROUP_ADD:
571 case OCFS2_IOC_GROUP_ADD64: 1006 case OCFS2_IOC_GROUP_ADD64:
1007 case FITRIM:
572 break; 1008 break;
573 case OCFS2_IOC_REFLINK: 1009 case OCFS2_IOC_REFLINK:
574 if (copy_from_user(&args, (struct reflink_arguments *)arg, 1010 if (copy_from_user(&args, (struct reflink_arguments *)arg,
@@ -584,6 +1020,8 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
584 return -EFAULT; 1020 return -EFAULT;
585 1021
586 return ocfs2_info_handle(inode, &info, 1); 1022 return ocfs2_info_handle(inode, &info, 1);
1023 case OCFS2_IOC_MOVE_EXT:
1024 break;
587 default: 1025 default:
588 return -ENOIOCTLCMD; 1026 return -ENOIOCTLCMD;
589 } 1027 }
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
new file mode 100644
index 000000000000..cd9427023d2e
--- /dev/null
+++ b/fs/ocfs2/move_extents.c
@@ -0,0 +1,1152 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * move_extents.c
5 *
6 * Copyright (C) 2011 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17#include <linux/fs.h>
18#include <linux/types.h>
19#include <linux/mount.h>
20#include <linux/swap.h>
21
22#include <cluster/masklog.h>
23
24#include "ocfs2.h"
25#include "ocfs2_ioctl.h"
26
27#include "alloc.h"
28#include "aops.h"
29#include "dlmglue.h"
30#include "extent_map.h"
31#include "inode.h"
32#include "journal.h"
33#include "suballoc.h"
34#include "uptodate.h"
35#include "super.h"
36#include "dir.h"
37#include "buffer_head_io.h"
38#include "sysfile.h"
39#include "suballoc.h"
40#include "refcounttree.h"
41#include "move_extents.h"
42
43struct ocfs2_move_extents_context {
44 struct inode *inode;
45 struct file *file;
46 int auto_defrag;
47 int partial;
48 int credits;
49 u32 new_phys_cpos;
50 u32 clusters_moved;
51 u64 refcount_loc;
52 struct ocfs2_move_extents *range;
53 struct ocfs2_extent_tree et;
54 struct ocfs2_alloc_context *meta_ac;
55 struct ocfs2_alloc_context *data_ac;
56 struct ocfs2_cached_dealloc_ctxt dealloc;
57};
58
59static int __ocfs2_move_extent(handle_t *handle,
60 struct ocfs2_move_extents_context *context,
61 u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
62 int ext_flags)
63{
64 int ret = 0, index;
65 struct inode *inode = context->inode;
66 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
67 struct ocfs2_extent_rec *rec, replace_rec;
68 struct ocfs2_path *path = NULL;
69 struct ocfs2_extent_list *el;
70 u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
71 u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
72
73 ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
74 p_cpos, new_p_cpos, len);
75 if (ret) {
76 mlog_errno(ret);
77 goto out;
78 }
79
80 memset(&replace_rec, 0, sizeof(replace_rec));
81 replace_rec.e_cpos = cpu_to_le32(cpos);
82 replace_rec.e_leaf_clusters = cpu_to_le16(len);
83 replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
84 new_p_cpos));
85
86 path = ocfs2_new_path_from_et(&context->et);
87 if (!path) {
88 ret = -ENOMEM;
89 mlog_errno(ret);
90 goto out;
91 }
92
93 ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
94 if (ret) {
95 mlog_errno(ret);
96 goto out;
97 }
98
99 el = path_leaf_el(path);
100
101 index = ocfs2_search_extent_list(el, cpos);
102 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
103 ocfs2_error(inode->i_sb,
104 "Inode %llu has an extent at cpos %u which can no "
105 "longer be found.\n",
106 (unsigned long long)ino, cpos);
107 ret = -EROFS;
108 goto out;
109 }
110
111 rec = &el->l_recs[index];
112
113 BUG_ON(ext_flags != rec->e_flags);
114 /*
 115 * After moving/defragging to the new location, the extent is not
 116 * going to be refcounted anymore.
117 */
118 replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
119
120 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
121 context->et.et_root_bh,
122 OCFS2_JOURNAL_ACCESS_WRITE);
123 if (ret) {
124 mlog_errno(ret);
125 goto out;
126 }
127
128 ret = ocfs2_split_extent(handle, &context->et, path, index,
129 &replace_rec, context->meta_ac,
130 &context->dealloc);
131 if (ret) {
132 mlog_errno(ret);
133 goto out;
134 }
135
136 ocfs2_journal_dirty(handle, context->et.et_root_bh);
137
138 context->new_phys_cpos = new_p_cpos;
139
140 /*
 141 * Do we need to append the old clusters to the truncate log?
142 */
143 if (old_blkno) {
144 if (ext_flags & OCFS2_EXT_REFCOUNTED)
145 ret = ocfs2_decrease_refcount(inode, handle,
146 ocfs2_blocks_to_clusters(osb->sb,
147 old_blkno),
148 len, context->meta_ac,
149 &context->dealloc, 1);
150 else
151 ret = ocfs2_truncate_log_append(osb, handle,
152 old_blkno, len);
153 }
154
155out:
156 return ret;
157}
158
159/*
 160 * Lock the allocators and reserve an appropriate number of bits for
 161 * metadata blocks and data clusters.
 162 *
 163 * In some cases we don't need to reserve clusters; just pass a NULL
 164 * data_ac.
165 */
166static int ocfs2_lock_allocators_move_extents(struct inode *inode,
167 struct ocfs2_extent_tree *et,
168 u32 clusters_to_move,
169 u32 extents_to_split,
170 struct ocfs2_alloc_context **meta_ac,
171 struct ocfs2_alloc_context **data_ac,
172 int extra_blocks,
173 int *credits)
174{
175 int ret, num_free_extents;
176 unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
177 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
178
179 num_free_extents = ocfs2_num_free_extents(osb, et);
180 if (num_free_extents < 0) {
181 ret = num_free_extents;
182 mlog_errno(ret);
183 goto out;
184 }
185
186 if (!num_free_extents ||
187 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
188 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
189
190 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
191 if (ret) {
192 mlog_errno(ret);
193 goto out;
194 }
195
196 if (data_ac) {
197 ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
198 if (ret) {
199 mlog_errno(ret);
200 goto out;
201 }
202 }
203
204 *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
205 clusters_to_move + 2);
206
207 mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
208 extra_blocks, clusters_to_move, *credits);
209out:
210 if (ret) {
211 if (*meta_ac) {
212 ocfs2_free_alloc_context(*meta_ac);
213 *meta_ac = NULL;
214 }
215 }
216
217 return ret;
218}
219
220/*
 221 * Use one journal handle to guarantee data consistency in case a
 222 * crash happens anywhere.
 223 *
 224 * XXX: defrag can end up finishing only part of the requested extent,
 225 * when not enough contiguous clusters can be found in the allocator.
226 */
227static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
228 u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
229{
230 int ret, credits = 0, extra_blocks = 0, partial = context->partial;
231 handle_t *handle;
232 struct inode *inode = context->inode;
233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
234 struct inode *tl_inode = osb->osb_tl_inode;
235 struct ocfs2_refcount_tree *ref_tree = NULL;
236 u32 new_phys_cpos, new_len;
237 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
238
239 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
240
241 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
242 OCFS2_HAS_REFCOUNT_FL));
243
244 BUG_ON(!context->refcount_loc);
245
246 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
247 &ref_tree, NULL);
248 if (ret) {
249 mlog_errno(ret);
250 return ret;
251 }
252
253 ret = ocfs2_prepare_refcount_change_for_del(inode,
254 context->refcount_loc,
255 phys_blkno,
256 *len,
257 &credits,
258 &extra_blocks);
259 if (ret) {
260 mlog_errno(ret);
261 goto out;
262 }
263 }
264
265 ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
266 &context->meta_ac,
267 &context->data_ac,
268 extra_blocks, &credits);
269 if (ret) {
270 mlog_errno(ret);
271 goto out;
272 }
273
274 /*
 275 * Should we be using the allocation reservation strategy here?
276 *
277 * if (context->data_ac)
278 * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
279 */
280
281 mutex_lock(&tl_inode->i_mutex);
282
283 if (ocfs2_truncate_log_needs_flush(osb)) {
284 ret = __ocfs2_flush_truncate_log(osb);
285 if (ret < 0) {
286 mlog_errno(ret);
287 goto out_unlock_mutex;
288 }
289 }
290
291 handle = ocfs2_start_trans(osb, credits);
292 if (IS_ERR(handle)) {
293 ret = PTR_ERR(handle);
294 mlog_errno(ret);
295 goto out_unlock_mutex;
296 }
297
298 ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
299 &new_phys_cpos, &new_len);
300 if (ret) {
301 mlog_errno(ret);
302 goto out_commit;
303 }
304
305 /*
 306 * Allowing partial extent moving has pros and cons: it makes the
 307 * whole defragmentation less likely to fail, but on the other hand
 308 * it may leave the fs even more fragmented after moving. Let
 309 * userspace make the decision here.
310 */
311 if (new_len != *len) {
312 mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
313 if (!partial) {
314 context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
315 ret = -ENOSPC;
316 goto out_commit;
317 }
318 }
319
320 mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
321 phys_cpos, new_phys_cpos);
322
323 ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
324 new_phys_cpos, ext_flags);
325 if (ret)
326 mlog_errno(ret);
327
328 if (partial && (new_len != *len))
329 *len = new_len;
330
331 /*
332 * Here we should write the new page out first if we are
333 * in write-back mode.
334 */
335 ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
336 if (ret)
337 mlog_errno(ret);
338
339out_commit:
340 ocfs2_commit_trans(osb, handle);
341
342out_unlock_mutex:
343 mutex_unlock(&tl_inode->i_mutex);
344
345 if (context->data_ac) {
346 ocfs2_free_alloc_context(context->data_ac);
347 context->data_ac = NULL;
348 }
349
350 if (context->meta_ac) {
351 ocfs2_free_alloc_context(context->meta_ac);
352 context->meta_ac = NULL;
353 }
354
355out:
356 if (ref_tree)
357 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
358
359 return ret;
360}
361
362/*
 363 * Find the victim alloc group into which 'vict_blkno' falls.
364 */
365static int ocfs2_find_victim_alloc_group(struct inode *inode,
366 u64 vict_blkno,
367 int type, int slot,
368 int *vict_bit,
369 struct buffer_head **ret_bh)
370{
371 int ret, i, bits_per_unit = 0;
372 u64 blkno;
373 char namebuf[40];
374
375 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
376 struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
377 struct ocfs2_chain_list *cl;
378 struct ocfs2_chain_rec *rec;
379 struct ocfs2_dinode *ac_dinode;
380 struct ocfs2_group_desc *bg;
381
382 ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
383 ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
384 strlen(namebuf), &blkno);
385 if (ret) {
386 ret = -ENOENT;
387 goto out;
388 }
389
390 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
391 if (ret) {
392 mlog_errno(ret);
393 goto out;
394 }
395
396 ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
397 cl = &(ac_dinode->id2.i_chain);
398 rec = &(cl->cl_recs[0]);
399
400 if (type == GLOBAL_BITMAP_SYSTEM_INODE)
401 bits_per_unit = osb->s_clustersize_bits -
402 inode->i_sb->s_blocksize_bits;
403 /*
 404 * Bail out if 'vict_blkno' is out of the valid range.
405 */
406 if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
407 (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
408 bits_per_unit))) {
409 ret = -EINVAL;
410 goto out;
411 }
412
413 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
414
415 rec = &(cl->cl_recs[i]);
416 if (!rec)
417 continue;
418
419 bg = NULL;
420
421 do {
422 if (!bg)
423 blkno = le64_to_cpu(rec->c_blkno);
424 else
425 blkno = le64_to_cpu(bg->bg_next_group);
426
427 if (gd_bh) {
428 brelse(gd_bh);
429 gd_bh = NULL;
430 }
431
432 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
433 if (ret) {
434 mlog_errno(ret);
435 goto out;
436 }
437
438 bg = (struct ocfs2_group_desc *)gd_bh->b_data;
439
440 if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
441 le16_to_cpu(bg->bg_bits))) {
442
443 *ret_bh = gd_bh;
444 *vict_bit = (vict_blkno - blkno) >>
445 bits_per_unit;
446 mlog(0, "find the victim group: #%llu, "
447 "total_bits: %u, vict_bit: %u\n",
448 blkno, le16_to_cpu(bg->bg_bits),
449 *vict_bit);
450 goto out;
451 }
452
453 } while (le64_to_cpu(bg->bg_next_group));
454 }
455
456 ret = -EINVAL;
457out:
458 brelse(ac_bh);
459
460 /*
461 * caller has to release the gd_bh properly.
462 */
463 return ret;
464}
465
466/*
467 * XXX: helper to validate and adjust moving goal.
468 */
469static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
470 struct ocfs2_move_extents *range)
471{
472 int ret, goal_bit = 0;
473
474 struct buffer_head *gd_bh = NULL;
475 struct ocfs2_group_desc *bg = NULL;
476 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
477 int c_to_b = 1 << (osb->s_clustersize_bits -
478 inode->i_sb->s_blocksize_bits);
479
 480 /*
 481 * Make the goal cluster-aligned.
 482 */
 483 range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
 484 range->me_goal);
 485 /*
 486 * Validate that the goal sits within the global_bitmap, and
 487 * return the victim group desc.
 488 */
 489 ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
 490 GLOBAL_BITMAP_SYSTEM_INODE,
 491 OCFS2_INVALID_SLOT,
 492 &goal_bit, &gd_bh);
 493 if (ret)
 494 goto out;
 495
 496 bg = (struct ocfs2_group_desc *)gd_bh->b_data;
 497
 498 /*
 499 * The moving goal is not allowed to start at a group descriptor
 500 * block (block #0 of the group); compromise to the next cluster.
 501 */
 502 if (range->me_goal == le64_to_cpu(bg->bg_blkno))
 503 range->me_goal += c_to_b;
 504
505 /*
 506 * The movement is not allowed to cross two groups.
507 */
508 if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
509 range->me_len) {
510 ret = -EINVAL;
511 goto out;
512 }
513 /*
 514 * More exact validations/adjustments will be performed later,
 515 * during the moving operation for each extent range.
516 */
517 mlog(0, "extents get ready to be moved to #%llu block\n",
518 range->me_goal);
519
520out:
521 brelse(gd_bh);
522
523 return ret;
524}
525
526static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
527 int *goal_bit, u32 move_len, u32 max_hop,
528 u32 *phys_cpos)
529{
530 int i, used, last_free_bits = 0, base_bit = *goal_bit;
531 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
532 u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
533 le64_to_cpu(gd->bg_blkno));
534
535 for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
536
537 used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
538 if (used) {
539 /*
 540 * We even tried searching for a free chunk by jumping
 541 * up to a 'max_hop' distance, but still failed.
542 */
543 if ((i - base_bit) > max_hop) {
544 *phys_cpos = 0;
545 break;
546 }
547
548 if (last_free_bits)
549 last_free_bits = 0;
550
551 continue;
552 } else
553 last_free_bits++;
554
555 if (last_free_bits == move_len) {
556 *goal_bit = i;
557 *phys_cpos = base_cpos + i;
558 break;
559 }
560 }
561
562 mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
563}
564
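
The probe above is a hop-bounded linear scan: starting from the goal bit it looks for 'move_len' contiguous free bits, resetting the run on every used bit and giving up once it has drifted more than 'max_hop' bits from the start. Note that, like the code above, it records the bit that *ends* the run. A standalone sketch with a made-up bitmap:

#include <stdio.h>

int main(void)
{
        /* 1 = used cluster; goal at bit 1, want a run of 3 free bits */
        int bitmap[16] = { 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1 };
        int base_bit = 1, move_len = 3, max_hop = 8;
        int i, last_free = 0, found = -1;

        for (i = base_bit; i < 16; i++) {
                if (bitmap[i]) {
                        if (i - base_bit > max_hop)
                                break;          /* hopped too far: give up */
                        last_free = 0;
                        continue;
                }
                if (++last_free == move_len) {
                        found = i;              /* end bit of the free run */
                        break;
                }
        }

        printf("free run of %d clusters ends at bit %d\n", move_len, found);
        return 0;
}
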
565static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
566 handle_t *handle,
567 struct buffer_head *di_bh,
568 u32 num_bits,
569 u16 chain)
570{
571 int ret;
572 u32 tmp_used;
573 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
574 struct ocfs2_chain_list *cl =
575 (struct ocfs2_chain_list *) &di->id2.i_chain;
576
577 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
578 OCFS2_JOURNAL_ACCESS_WRITE);
579 if (ret < 0) {
580 mlog_errno(ret);
581 goto out;
582 }
583
584 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
585 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
586 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
587 ocfs2_journal_dirty(handle, di_bh);
588
589out:
590 return ret;
591}
592
593static inline int ocfs2_block_group_set_bits(handle_t *handle,
594 struct inode *alloc_inode,
595 struct ocfs2_group_desc *bg,
596 struct buffer_head *group_bh,
597 unsigned int bit_off,
598 unsigned int num_bits)
599{
600 int status;
601 void *bitmap = bg->bg_bitmap;
602 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
603
604 /* All callers get the descriptor via
605 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
606 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
607 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
608
609 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
610 num_bits);
611
612 if (ocfs2_is_cluster_bitmap(alloc_inode))
613 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
614
615 status = ocfs2_journal_access_gd(handle,
616 INODE_CACHE(alloc_inode),
617 group_bh,
618 journal_type);
619 if (status < 0) {
620 mlog_errno(status);
621 goto bail;
622 }
623
624 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
625 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
626 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
627 " count %u but claims %u are freed. num_bits %d",
628 (unsigned long long)le64_to_cpu(bg->bg_blkno),
629 le16_to_cpu(bg->bg_bits),
630 le16_to_cpu(bg->bg_free_bits_count), num_bits);
631 return -EROFS;
632 }
633 while (num_bits--)
634 ocfs2_set_bit(bit_off++, bitmap);
635
636 ocfs2_journal_dirty(handle, group_bh);
637
638bail:
639 return status;
640}
641
642static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
643 u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
644 u32 len, int ext_flags)
645{
646 int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
647 handle_t *handle;
648 struct inode *inode = context->inode;
649 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
650 struct inode *tl_inode = osb->osb_tl_inode;
651 struct inode *gb_inode = NULL;
652 struct buffer_head *gb_bh = NULL;
653 struct buffer_head *gd_bh = NULL;
654 struct ocfs2_group_desc *gd;
655 struct ocfs2_refcount_tree *ref_tree = NULL;
656 u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
657 context->range->me_threshold);
658 u64 phys_blkno, new_phys_blkno;
659
660 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
661
662 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
663
664 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
665 OCFS2_HAS_REFCOUNT_FL));
666
667 BUG_ON(!context->refcount_loc);
668
669 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
670 &ref_tree, NULL);
671 if (ret) {
672 mlog_errno(ret);
673 return ret;
674 }
675
676 ret = ocfs2_prepare_refcount_change_for_del(inode,
677 context->refcount_loc,
678 phys_blkno,
679 len,
680 &credits,
681 &extra_blocks);
682 if (ret) {
683 mlog_errno(ret);
684 goto out;
685 }
686 }
687
688 ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
689 &context->meta_ac,
690 NULL, extra_blocks, &credits);
691 if (ret) {
692 mlog_errno(ret);
693 goto out;
694 }
695
696 /*
 697 * We need to count 2 extra credits for the global_bitmap inode and
 698 * the group descriptor.
699 */
700 credits += OCFS2_INODE_UPDATE_CREDITS + 1;
701
702 /*
 703 * ocfs2_move_extent() doesn't reserve any clusters in the
 704 * lock_allocators() logic, but we still need to lock the global_bitmap.
705 */
706 gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
707 OCFS2_INVALID_SLOT);
708 if (!gb_inode) {
709 mlog(ML_ERROR, "unable to get global_bitmap inode\n");
710 ret = -EIO;
711 goto out;
712 }
713
714 mutex_lock(&gb_inode->i_mutex);
715
716 ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
717 if (ret) {
718 mlog_errno(ret);
719 goto out_unlock_gb_mutex;
720 }
721
722 mutex_lock(&tl_inode->i_mutex);
723
724 handle = ocfs2_start_trans(osb, credits);
725 if (IS_ERR(handle)) {
726 ret = PTR_ERR(handle);
727 mlog_errno(ret);
728 goto out_unlock_tl_inode;
729 }
730
731 new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
732 ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
733 GLOBAL_BITMAP_SYSTEM_INODE,
734 OCFS2_INVALID_SLOT,
735 &goal_bit, &gd_bh);
736 if (ret) {
737 mlog_errno(ret);
738 goto out_commit;
739 }
740
741 /*
 742 * Probe the victim cluster group to find a proper
 743 * region to fit the wanted movement; it will even perform
 744 * a best-effort attempt by compromising to a threshold
 745 * around the goal.
746 */
747 ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
748 new_phys_cpos);
749 if (!new_phys_cpos) {
750 ret = -ENOSPC;
751 goto out_commit;
752 }
753
754 ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
755 *new_phys_cpos, ext_flags);
756 if (ret) {
757 mlog_errno(ret);
758 goto out_commit;
759 }
760
761 gd = (struct ocfs2_group_desc *)gd_bh->b_data;
762 ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
763 le16_to_cpu(gd->bg_chain));
764 if (ret) {
765 mlog_errno(ret);
766 goto out_commit;
767 }
768
769 ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
770 goal_bit, len);
771 if (ret)
772 mlog_errno(ret);
773
774 /*
775 * Here we should write the new page out first if we are
776 * in write-back mode.
777 */
778 ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
779 if (ret)
780 mlog_errno(ret);
781
782out_commit:
783 ocfs2_commit_trans(osb, handle);
784 brelse(gd_bh);
785
786out_unlock_tl_inode:
787 mutex_unlock(&tl_inode->i_mutex);
788
789 ocfs2_inode_unlock(gb_inode, 1);
790out_unlock_gb_mutex:
791 mutex_unlock(&gb_inode->i_mutex);
792 brelse(gb_bh);
793 iput(gb_inode);
794
795out:
796 if (context->meta_ac) {
797 ocfs2_free_alloc_context(context->meta_ac);
798 context->meta_ac = NULL;
799 }
800
801 if (ref_tree)
802 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
803
804 return ret;
805}
806
807/*
 808 * Helper to calculate the defragging length in one run per the threshold.
809 */
810static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
811 u32 threshold, int *skip)
812{
813 if ((*alloc_size + *len_defraged) < threshold) {
814 /*
 815 * Proceed with defragmentation until we meet the threshold.
816 */
817 *len_defraged += *alloc_size;
818 } else if (*len_defraged == 0) {
819 /*
820 * XXX: skip a large extent.
821 */
822 *skip = 1;
823 } else {
824 /*
 825 * Split this extent so that it coalesces with the former
 826 * pieces to reach the threshold.
 827 *
 828 * We're done here with one cycle of defragmentation of
 829 * size 'thresh'; resetting 'len_defraged' forces a new
 830 * defragmentation cycle.
831 */
832 *alloc_size = threshold - *len_defraged;
833 *len_defraged = 0;
834 }
835}
836
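
To see the helper's three branches in action, here is a small standalone harness (an illustrative sketch, not part of this patch) replaying the same logic with a made-up threshold of 8 clusters: small extents accumulate, an extent crossing the threshold is split, and an extent that alone meets the threshold is skipped:

#include <stdio.h>

typedef unsigned int u32;

/* same branch structure as ocfs2_calc_extent_defrag_len() above */
static void calc_defrag_len(u32 *alloc_size, u32 *len_defraged,
                            u32 threshold, int *skip)
{
        if ((*alloc_size + *len_defraged) < threshold)
                *len_defraged += *alloc_size;
        else if (*len_defraged == 0)
                *skip = 1;              /* lone extent already >= threshold */
        else {
                *alloc_size = threshold - *len_defraged;
                *len_defraged = 0;      /* one defrag cycle finished */
        }
}

int main(void)
{
        u32 extents[] = { 3, 2, 6, 10 }, len_defraged = 0;
        int i, skip;

        for (i = 0; i < 4; i++) {
                u32 sz = extents[i];

                skip = 0;
                calc_defrag_len(&sz, &len_defraged, 8, &skip);
                /* the caller re-enters with the remainder of a split extent */
                printf("extent of %u: move %u, accumulated %u, skip=%d\n",
                       extents[i], skip ? 0 : sz, len_defraged, skip);
        }
        return 0;
}
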
837static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
838 struct ocfs2_move_extents_context *context)
839{
840 int ret = 0, flags, do_defrag, skip = 0;
841 u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
842 u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
843
844 struct inode *inode = context->inode;
845 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
846 struct ocfs2_move_extents *range = context->range;
847 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
848
849 if ((inode->i_size == 0) || (range->me_len == 0))
850 return 0;
851
852 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
853 return 0;
854
855 context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
856
857 ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
858 ocfs2_init_dealloc_ctxt(&context->dealloc);
859
860 /*
861 * TO-DO XXX:
862 *
863 * - xattr extents.
864 */
865
866 do_defrag = context->auto_defrag;
867
868 /*
 869 * Extent moving happens in units of clusters; for the sake
 870 * of simplicity, we may ignore the two clusters in which
 871 * 'byte_start' and 'byte_start + len' fall.
872 */
873 move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
874 len_to_move = (range->me_start + range->me_len) >>
875 osb->s_clustersize_bits;
876 if (len_to_move >= move_start)
877 len_to_move -= move_start;
878 else
879 len_to_move = 0;
880
881 if (do_defrag) {
882 defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
883 if (defrag_thresh <= 1)
884 goto done;
885 } else
886 new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
887 range->me_goal);
888
889 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
890 "thresh: %u\n",
891 (unsigned long long)OCFS2_I(inode)->ip_blkno,
892 (unsigned long long)range->me_start,
893 (unsigned long long)range->me_len,
894 move_start, len_to_move, defrag_thresh);
895
896 cpos = move_start;
897 while (len_to_move) {
898 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
899 &flags);
900 if (ret) {
901 mlog_errno(ret);
902 goto out;
903 }
904
905 if (alloc_size > len_to_move)
906 alloc_size = len_to_move;
907
908 /*
909 * XXX: how to deal with a hole:
910 *
911 * - skip the hole of course
912 * - force a new defragmentation
913 */
914 if (!phys_cpos) {
915 if (do_defrag)
916 len_defraged = 0;
917
918 goto next;
919 }
920
921 if (do_defrag) {
922 ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
923 defrag_thresh, &skip);
924 /*
925 * skip large extents
926 */
927 if (skip) {
928 skip = 0;
929 goto next;
930 }
931
932 mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
933 "alloc_size: %u, len_defraged: %u\n",
934 cpos, phys_cpos, alloc_size, len_defraged);
935
936 ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
937 &alloc_size, flags);
938 } else {
939 ret = ocfs2_move_extent(context, cpos, phys_cpos,
940 &new_phys_cpos, alloc_size,
941 flags);
942
943 new_phys_cpos += alloc_size;
944 }
945
946 if (ret < 0) {
947 mlog_errno(ret);
948 goto out;
949 }
950
951 context->clusters_moved += alloc_size;
952next:
953 cpos += alloc_size;
954 len_to_move -= alloc_size;
955 }
956
957done:
958 range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
959
960out:
961 range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
962 context->clusters_moved);
963 range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
964 context->new_phys_cpos);
965
966 ocfs2_schedule_truncate_log_flush(osb, 1);
967 ocfs2_run_deallocs(osb, &context->dealloc);
968
969 return ret;
970}
971
972static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
973{
974 int status;
975 handle_t *handle;
976 struct inode *inode = context->inode;
977 struct ocfs2_dinode *di;
978 struct buffer_head *di_bh = NULL;
979 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
980
981 if (!inode)
982 return -ENOENT;
983
984 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
985 return -EROFS;
986
987 mutex_lock(&inode->i_mutex);
988
989 /*
990 * This prevents concurrent writes from other nodes
991 */
992 status = ocfs2_rw_lock(inode, 1);
993 if (status) {
994 mlog_errno(status);
995 goto out;
996 }
997
998 status = ocfs2_inode_lock(inode, &di_bh, 1);
999 if (status) {
1000 mlog_errno(status);
1001 goto out_rw_unlock;
1002 }
1003
1004 /*
 1005 * Remember that ip_xattr_sem also needs to be held if necessary.
1006 */
1007 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1008
1009 status = __ocfs2_move_extents_range(di_bh, context);
1010
1011 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1012 if (status) {
1013 mlog_errno(status);
1014 goto out_inode_unlock;
1015 }
1016
1017 /*
1018 * We update ctime for these changes
1019 */
1020 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1021 if (IS_ERR(handle)) {
1022 status = PTR_ERR(handle);
1023 mlog_errno(status);
1024 goto out_inode_unlock;
1025 }
1026
1027 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1028 OCFS2_JOURNAL_ACCESS_WRITE);
1029 if (status) {
1030 mlog_errno(status);
1031 goto out_commit;
1032 }
1033
1034 di = (struct ocfs2_dinode *)di_bh->b_data;
1035 inode->i_ctime = CURRENT_TIME;
1036 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1037 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1038
1039 ocfs2_journal_dirty(handle, di_bh);
1040
1041out_commit:
1042 ocfs2_commit_trans(osb, handle);
1043
1044out_inode_unlock:
1045 brelse(di_bh);
1046 ocfs2_inode_unlock(inode, 1);
1047out_rw_unlock:
1048 ocfs2_rw_unlock(inode, 1);
1049out:
1050 mutex_unlock(&inode->i_mutex);
1051
1052 return status;
1053}
1054
1055int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
1056{
1057 int status;
1058
1059 struct inode *inode = filp->f_path.dentry->d_inode;
1060 struct ocfs2_move_extents range;
1061 struct ocfs2_move_extents_context *context = NULL;
1062
1063 status = mnt_want_write(filp->f_path.mnt);
1064 if (status)
1065 return status;
1066
1067 if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
1068 goto out;
1069
1070 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1071 status = -EPERM;
1072 goto out;
1073 }
1074
1075 context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
1076 if (!context) {
1077 status = -ENOMEM;
1078 mlog_errno(status);
1079 goto out;
1080 }
1081
1082 context->inode = inode;
1083 context->file = filp;
1084
1085 if (argp) {
1086 if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
1087 sizeof(range))) {
1088 status = -EFAULT;
1089 goto out;
1090 }
1091 } else {
1092 status = -EINVAL;
1093 goto out;
1094 }
1095
1096 if (range.me_start > i_size_read(inode))
1097 goto out;
1098
1099 if (range.me_start + range.me_len > i_size_read(inode))
1100 range.me_len = i_size_read(inode) - range.me_start;
1101
1102 context->range = &range;
1103
1104 if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
1105 context->auto_defrag = 1;
1106 /*
 1107 * OK, the default threshold for defragmentation
 1108 * is 1M, since our maximum cluster size is also 1M.
 1109 * Any thoughts?
1110 */
1111 if (!range.me_threshold)
1112 range.me_threshold = 1024 * 1024;
1113
1114 if (range.me_threshold > i_size_read(inode))
1115 range.me_threshold = i_size_read(inode);
1116
1117 if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
1118 context->partial = 1;
1119 } else {
1120 /*
 1121 * A first best-effort attempt to validate and adjust the goal
 1122 * (physical address in blocks); it can't guarantee that the later
 1123 * operation will always succeed, since the global_bitmap may
 1124 * change a bit over time.
1125 */
1126
1127 status = ocfs2_validate_and_adjust_move_goal(inode, &range);
1128 if (status)
1129 goto out;
1130 }
1131
1132 status = ocfs2_move_extents(context);
1133 if (status)
1134 mlog_errno(status);
1135out:
1136 /*
 1137 * Movement/defragmentation may end up only partially completed;
 1138 * that's why we need to return the finished length and new_offset
 1139 * to userspace even if a failure happens somewhere.
1140 */
1141 if (argp) {
1142 if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
1143 sizeof(range)))
1144 status = -EFAULT;
1145 }
1146
1147 kfree(context);
1148
1149 mnt_drop_write(filp->f_path.mnt);
1150
1151 return status;
1152}
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h
new file mode 100644
index 000000000000..4e143e811441
--- /dev/null
+++ b/fs/ocfs2/move_extents.h
@@ -0,0 +1,22 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * move_extents.h
5 *
6 * Copyright (C) 2011 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17#ifndef OCFS2_MOVE_EXTENTS_H
18#define OCFS2_MOVE_EXTENTS_H
19
20int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp);
21
22#endif /* OCFS2_MOVE_EXTENTS_H */
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index b46f39bf7438..5b27ff1fa577 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -142,6 +142,38 @@ struct ocfs2_info_journal_size {
142 __u64 ij_journal_size; 142 __u64 ij_journal_size;
143}; 143};
144 144
145struct ocfs2_info_freeinode {
146 struct ocfs2_info_request ifi_req;
147 struct ocfs2_info_local_freeinode {
148 __u64 lfi_total;
149 __u64 lfi_free;
150 } ifi_stat[OCFS2_MAX_SLOTS];
151 __u32 ifi_slotnum; /* out */
152 __u32 ifi_pad;
153};
154
155#define OCFS2_INFO_MAX_HIST (32)
156
157struct ocfs2_info_freefrag {
158 struct ocfs2_info_request iff_req;
159 struct ocfs2_info_freefrag_stats { /* (out) */
160 struct ocfs2_info_free_chunk_list {
161 __u32 fc_chunks[OCFS2_INFO_MAX_HIST];
162 __u32 fc_clusters[OCFS2_INFO_MAX_HIST];
163 } ffs_fc_hist;
164 __u32 ffs_clusters;
165 __u32 ffs_free_clusters;
166 __u32 ffs_free_chunks;
167 __u32 ffs_free_chunks_real;
168 __u32 ffs_min; /* Minimum free chunksize in clusters */
169 __u32 ffs_max;
170 __u32 ffs_avg;
171 __u32 ffs_pad;
172 } iff_ffs;
 173 __u32 iff_chunksize; /* chunksize in clusters (in) */
174 __u32 iff_pad;
175};
176
145/* Codes for ocfs2_info_request */ 177/* Codes for ocfs2_info_request */
146enum ocfs2_info_type { 178enum ocfs2_info_type {
147 OCFS2_INFO_CLUSTERSIZE = 1, 179 OCFS2_INFO_CLUSTERSIZE = 1,
@@ -151,6 +183,8 @@ enum ocfs2_info_type {
151 OCFS2_INFO_UUID, 183 OCFS2_INFO_UUID,
152 OCFS2_INFO_FS_FEATURES, 184 OCFS2_INFO_FS_FEATURES,
153 OCFS2_INFO_JOURNAL_SIZE, 185 OCFS2_INFO_JOURNAL_SIZE,
186 OCFS2_INFO_FREEINODE,
187 OCFS2_INFO_FREEFRAG,
154 OCFS2_INFO_NUM_TYPES 188 OCFS2_INFO_NUM_TYPES
155}; 189};
156 190
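
A hedged userspace sketch of issuing the new OCFS2_INFO_FREEFRAG request (not part of this patch); the OCFS2_INFO_MAGIC constant and the oi_requests/oi_count layout of struct ocfs2_info are assumed from the rest of ocfs2_ioctl.h, and the 8-cluster chunksize is a made-up input:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include "ocfs2_ioctl.h"        /* structs and codes defined above */

int main(int argc, char **argv)
{
        struct ocfs2_info_freefrag ff;
        struct ocfs2_info info;
        __u64 reqs[1];
        int fd;

        if (argc != 2)
                return 1;

        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        memset(&ff, 0, sizeof(ff));
        ff.iff_req.ir_magic = OCFS2_INFO_MAGIC; /* assumed constant */
        ff.iff_req.ir_code = OCFS2_INFO_FREEFRAG;
        ff.iff_req.ir_size = sizeof(ff);
        ff.iff_chunksize = 8;   /* power of 2, at most clusters per group */

        memset(&info, 0, sizeof(info));
        reqs[0] = (__u64)(unsigned long)&ff;
        info.oi_requests = (__u64)(unsigned long)reqs;  /* assumed layout */
        info.oi_count = 1;

        if (ioctl(fd, OCFS2_IOC_INFO, &info) < 0) {
                perror("ioctl(OCFS2_IOC_INFO)");
                return 1;
        }

        printf("%u of %u clusters free, %u whole free chunks of %u clusters\n",
               ff.iff_ffs.ffs_free_clusters, ff.iff_ffs.ffs_clusters,
               ff.iff_ffs.ffs_free_chunks, ff.iff_chunksize);
        return 0;
}
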
@@ -171,4 +205,38 @@ enum ocfs2_info_type {
171 205
172#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) 206#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
173 207
208struct ocfs2_move_extents {
209/* All values are in bytes */
210 /* in */
211 __u64 me_start; /* Virtual start in the file to move */
212 __u64 me_len; /* Length of the extents to be moved */
 213 __u64 me_goal; /* Physical offset of the goal,
 214 in block units */
215 __u64 me_threshold; /* Maximum distance from goal or threshold
216 for auto defragmentation */
217 __u64 me_flags; /* Flags for the operation:
218 * - auto defragmentation.
219 * - refcount,xattr cases.
220 */
221 /* out */
222 __u64 me_moved_len; /* Moved/defraged length */
223 __u64 me_new_offset; /* Resulting physical location */
 224 __u32 me_reserved[2]; /* Reserved for future use */
225};
226
 227#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG (0x00000001) /* Kernel claims new
 228 clusters as the goal
 229 location for the
 230 extents being moved */
 231#define OCFS2_MOVE_EXT_FL_PART_DEFRAG (0x00000002) /* Allow partial extent
 232 moving; makes movement
 233 less likely to fail,
 234 but may leave the fs
 235 even more fragmented */
 236#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmentation
 237 completed entirely.
238 */
239
240#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents)
241
174#endif /* OCFS2_IOCTL_H */ 242#endif /* OCFS2_IOCTL_H */
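
And a companion sketch for driving OCFS2_IOC_MOVE_EXT in auto-defrag mode (an illustration, not part of this patch); the struct fields and flags are the ones defined above, while the target file and the 1M threshold are made-up inputs:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include "ocfs2_ioctl.h"        /* OCFS2_IOC_MOVE_EXT and friends, above */

int main(int argc, char **argv)
{
        struct ocfs2_move_extents me;
        struct stat st;
        int fd, ret;

        if (argc != 2)
                return 1;

        fd = open(argv[1], O_RDWR);     /* the handler insists on FMODE_WRITE */
        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (fstat(fd, &st) < 0) {
                perror("fstat");
                return 1;
        }

        memset(&me, 0, sizeof(me));
        me.me_start = 0;                /* virtual offset, in bytes */
        me.me_len = st.st_size;         /* defrag the whole file */
        me.me_threshold = 1024 * 1024;  /* 1M, the kernel's default */
        me.me_flags = OCFS2_MOVE_EXT_FL_AUTO_DEFRAG |
                      OCFS2_MOVE_EXT_FL_PART_DEFRAG;

        ret = ioctl(fd, OCFS2_IOC_MOVE_EXT, &me);

        /* progress is reported back even when the ioctl fails part-way */
        printf("moved %llu bytes, complete=%d, ret=%d\n",
               (unsigned long long)me.me_moved_len,
               !!(me.me_flags & OCFS2_MOVE_EXT_FL_COMPLETE), ret);
        return ret < 0 ? 1 : 0;
}
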
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index a1dae5bb54ac..3b481f490633 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -688,6 +688,31 @@ TRACE_EVENT(ocfs2_cache_block_dealloc,
688 __entry->blkno, __entry->bit) 688 __entry->blkno, __entry->bit)
689); 689);
690 690
691TRACE_EVENT(ocfs2_trim_extent,
692 TP_PROTO(struct super_block *sb, unsigned long long blk,
693 unsigned long long count),
694 TP_ARGS(sb, blk, count),
695 TP_STRUCT__entry(
696 __field(int, dev_major)
697 __field(int, dev_minor)
698 __field(unsigned long long, blk)
699 __field(__u64, count)
700 ),
701 TP_fast_assign(
702 __entry->dev_major = MAJOR(sb->s_dev);
703 __entry->dev_minor = MINOR(sb->s_dev);
704 __entry->blk = blk;
705 __entry->count = count;
706 ),
707 TP_printk("%d %d %llu %llu",
708 __entry->dev_major, __entry->dev_minor,
709 __entry->blk, __entry->count)
710);
711
712DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
713
714DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
715
691/* End of trace events for fs/ocfs2/alloc.c. */ 716/* End of trace events for fs/ocfs2/alloc.c. */
692 717
693/* Trace events for fs/ocfs2/localalloc.c. */ 718/* Trace events for fs/ocfs2/localalloc.c. */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 5d32749c896d..ebfd3825f12a 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -66,7 +66,7 @@ struct ocfs2_cow_context {
66 u32 *num_clusters, 66 u32 *num_clusters,
67 unsigned int *extent_flags); 67 unsigned int *extent_flags);
68 int (*cow_duplicate_clusters)(handle_t *handle, 68 int (*cow_duplicate_clusters)(handle_t *handle,
69 struct ocfs2_cow_context *context, 69 struct file *file,
70 u32 cpos, u32 old_cluster, 70 u32 cpos, u32 old_cluster,
71 u32 new_cluster, u32 new_len); 71 u32 new_cluster, u32 new_len);
72}; 72};
@@ -2921,20 +2921,21 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2921 return 0; 2921 return 0;
2922} 2922}
2923 2923
2924static int ocfs2_duplicate_clusters_by_page(handle_t *handle, 2924int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2925 struct ocfs2_cow_context *context, 2925 struct file *file,
2926 u32 cpos, u32 old_cluster, 2926 u32 cpos, u32 old_cluster,
2927 u32 new_cluster, u32 new_len) 2927 u32 new_cluster, u32 new_len)
2928{ 2928{
2929 int ret = 0, partial; 2929 int ret = 0, partial;
2930 struct ocfs2_caching_info *ci = context->data_et.et_ci; 2930 struct inode *inode = file->f_path.dentry->d_inode;
2931 struct ocfs2_caching_info *ci = INODE_CACHE(inode);
2931 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 2932 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2932 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2933 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2933 struct page *page; 2934 struct page *page;
2934 pgoff_t page_index; 2935 pgoff_t page_index;
2935 unsigned int from, to, readahead_pages; 2936 unsigned int from, to, readahead_pages;
2936 loff_t offset, end, map_end; 2937 loff_t offset, end, map_end;
2937 struct address_space *mapping = context->inode->i_mapping; 2938 struct address_space *mapping = inode->i_mapping;
2938 2939
2939 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, 2940 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
2940 new_cluster, new_len); 2941 new_cluster, new_len);
@@ -2948,8 +2949,8 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2948 * We only duplicate pages until we reach the page contains i_size - 1. 2949 * We only duplicate pages until we reach the page contains i_size - 1.
2949 * So trim 'end' to i_size. 2950 * So trim 'end' to i_size.
2950 */ 2951 */
2951 if (end > i_size_read(context->inode)) 2952 if (end > i_size_read(inode))
2952 end = i_size_read(context->inode); 2953 end = i_size_read(inode);
2953 2954
2954 while (offset < end) { 2955 while (offset < end) {
2955 page_index = offset >> PAGE_CACHE_SHIFT; 2956 page_index = offset >> PAGE_CACHE_SHIFT;
@@ -2972,10 +2973,9 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2972 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) 2973 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2973 BUG_ON(PageDirty(page)); 2974 BUG_ON(PageDirty(page));
2974 2975
2975 if (PageReadahead(page) && context->file) { 2976 if (PageReadahead(page)) {
2976 page_cache_async_readahead(mapping, 2977 page_cache_async_readahead(mapping,
2977 &context->file->f_ra, 2978 &file->f_ra, file,
2978 context->file,
2979 page, page_index, 2979 page, page_index,
2980 readahead_pages); 2980 readahead_pages);
2981 } 2981 }
@@ -2999,8 +2999,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2999 } 2999 }
3000 } 3000 }
3001 3001
3002 ocfs2_map_and_dirty_page(context->inode, 3002 ocfs2_map_and_dirty_page(inode, handle, from, to,
3003 handle, from, to,
3004 page, 0, &new_block); 3003 page, 0, &new_block);
3005 mark_page_accessed(page); 3004 mark_page_accessed(page);
3006unlock: 3005unlock:
@@ -3015,14 +3014,15 @@ unlock:
3015 return ret; 3014 return ret;
3016} 3015}
3017 3016
3018static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, 3017int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3019 struct ocfs2_cow_context *context, 3018 struct file *file,
3020 u32 cpos, u32 old_cluster, 3019 u32 cpos, u32 old_cluster,
3021 u32 new_cluster, u32 new_len) 3020 u32 new_cluster, u32 new_len)
3022{ 3021{
3023 int ret = 0; 3022 int ret = 0;
3024 struct super_block *sb = context->inode->i_sb; 3023 struct inode *inode = file->f_path.dentry->d_inode;
3025 struct ocfs2_caching_info *ci = context->data_et.et_ci; 3024 struct super_block *sb = inode->i_sb;
3025 struct ocfs2_caching_info *ci = INODE_CACHE(inode);
3026 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); 3026 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
3027 u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); 3027 u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
3028 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 3028 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
@@ -3145,8 +3145,8 @@ static int ocfs2_replace_clusters(handle_t *handle,
3145 3145
3146 /*If the old clusters is unwritten, no need to duplicate. */ 3146 /*If the old clusters is unwritten, no need to duplicate. */
3147 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { 3147 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
3148 ret = context->cow_duplicate_clusters(handle, context, cpos, 3148 ret = context->cow_duplicate_clusters(handle, context->file,
3149 old, new, len); 3149 cpos, old, new, len);
3150 if (ret) { 3150 if (ret) {
3151 mlog_errno(ret); 3151 mlog_errno(ret);
3152 goto out; 3152 goto out;
@@ -3162,22 +3162,22 @@ out:
3162 return ret; 3162 return ret;
3163} 3163}
3164 3164
3165static int ocfs2_cow_sync_writeback(struct super_block *sb, 3165int ocfs2_cow_sync_writeback(struct super_block *sb,
3166 struct ocfs2_cow_context *context, 3166 struct inode *inode,
3167 u32 cpos, u32 num_clusters) 3167 u32 cpos, u32 num_clusters)
3168{ 3168{
3169 int ret = 0; 3169 int ret = 0;
3170 loff_t offset, end, map_end; 3170 loff_t offset, end, map_end;
3171 pgoff_t page_index; 3171 pgoff_t page_index;
3172 struct page *page; 3172 struct page *page;
3173 3173
3174 if (ocfs2_should_order_data(context->inode)) 3174 if (ocfs2_should_order_data(inode))
3175 return 0; 3175 return 0;
3176 3176
3177 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 3177 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
3178 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); 3178 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
3179 3179
3180 ret = filemap_fdatawrite_range(context->inode->i_mapping, 3180 ret = filemap_fdatawrite_range(inode->i_mapping,
3181 offset, end - 1); 3181 offset, end - 1);
3182 if (ret < 0) { 3182 if (ret < 0) {
3183 mlog_errno(ret); 3183 mlog_errno(ret);
@@ -3190,7 +3190,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
3190 if (map_end > end) 3190 if (map_end > end)
3191 map_end = end; 3191 map_end = end;
3192 3192
3193 page = find_or_create_page(context->inode->i_mapping, 3193 page = find_or_create_page(inode->i_mapping,
3194 page_index, GFP_NOFS); 3194 page_index, GFP_NOFS);
3195 BUG_ON(!page); 3195 BUG_ON(!page);
3196 3196
@@ -3349,7 +3349,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3349 * in write-back mode. 3349 * in write-back mode.
3350 */ 3350 */
3351 if (context->get_clusters == ocfs2_di_get_clusters) { 3351 if (context->get_clusters == ocfs2_di_get_clusters) {
3352 ret = ocfs2_cow_sync_writeback(sb, context, cpos, 3352 ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
3353 orig_num_clusters); 3353 orig_num_clusters);
3354 if (ret) 3354 if (ret)
3355 mlog_errno(ret); 3355 mlog_errno(ret);
@@ -3706,7 +3706,7 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
3706 context->cow_start = cow_start; 3706 context->cow_start = cow_start;
3707 context->cow_len = cow_len; 3707 context->cow_len = cow_len;
3708 context->ref_tree = ref_tree; 3708 context->ref_tree = ref_tree;
3709 context->ref_root_bh = ref_root_bh;; 3709 context->ref_root_bh = ref_root_bh;
3710 context->cow_object = xv; 3710 context->cow_object = xv;
3711 3711
3712 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd; 3712 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c8ce46f7d8e3..7754608c83a4 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -84,6 +84,17 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
84 struct buffer_head *ref_root_bh, 84 struct buffer_head *ref_root_bh,
85 u32 cpos, u32 write_len, 85 u32 cpos, u32 write_len,
86 struct ocfs2_post_refcount *post); 86 struct ocfs2_post_refcount *post);
87int ocfs2_duplicate_clusters_by_page(handle_t *handle,
88 struct file *file,
89 u32 cpos, u32 old_cluster,
90 u32 new_cluster, u32 new_len);
91int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
92 struct file *file,
93 u32 cpos, u32 old_cluster,
94 u32 new_cluster, u32 new_len);
95int ocfs2_cow_sync_writeback(struct super_block *sb,
96 struct inode *inode,
97 u32 cpos, u32 num_clusters);
87int ocfs2_add_refcount_flag(struct inode *inode, 98int ocfs2_add_refcount_flag(struct inode *inode,
88 struct ocfs2_extent_tree *data_et, 99 struct ocfs2_extent_tree *data_et,
89 struct ocfs2_caching_info *ref_ci, 100 struct ocfs2_caching_info *ref_ci,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5a521c748859..cdbaf5e97308 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,6 +41,7 @@
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/seq_file.h> 42#include <linux/seq_file.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/cleancache.h>
44 45
45#define CREATE_TRACE_POINTS 46#define CREATE_TRACE_POINTS
46#include "ocfs2_trace.h" 47#include "ocfs2_trace.h"
@@ -1566,7 +1567,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1566 if (osb->preferred_slot != OCFS2_INVALID_SLOT) 1567 if (osb->preferred_slot != OCFS2_INVALID_SLOT)
1567 seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); 1568 seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
1568 1569
1569 if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM) 1570 if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME))
1570 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); 1571 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
1571 1572
1572 if (osb->osb_commit_interval) 1573 if (osb->osb_commit_interval)
@@ -2352,6 +2353,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2352 mlog_errno(status); 2353 mlog_errno(status);
2353 goto bail; 2354 goto bail;
2354 } 2355 }
2356 cleancache_init_shared_fs((char *)&uuid_net_key, sb);
2355 2357
2356bail: 2358bail:
2357 return status; 2359 return status;
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index de4ff29f1e05..c368360c35a1 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -240,8 +240,12 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry)
240 struct inode *inode = dentry->d_inode; 240 struct inode *inode = dentry->d_inode;
241 int ret; 241 int ret;
242 242
243 if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode)) 243
244 return -ENOTEMPTY; 244 if (S_ISDIR(inode->i_mode)) {
245 dentry_unhash(dentry);
246 if (!omfs_dir_is_empty(inode))
247 return -ENOTEMPTY;
248 }
245 249
246 ret = omfs_delete_entry(dentry); 250 ret = omfs_delete_entry(dentry);
247 if (ret) 251 if (ret)
@@ -378,6 +382,9 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
378 int err; 382 int err;
379 383
380 if (new_inode) { 384 if (new_inode) {
385 if (S_ISDIR(new_inode->i_mode))
386 dentry_unhash(new_dentry);
387
381 /* overwriting existing file/dir */ 388 /* overwriting existing file/dir */
382 err = omfs_remove(new_dir, new_dentry); 389 err = omfs_remove(new_dir, new_dentry);
383 if (err) 390 if (err)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index d545e97d99c3..f82e762eeca2 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -255,7 +255,13 @@ ssize_t part_discard_alignment_show(struct device *dev,
255 struct device_attribute *attr, char *buf) 255 struct device_attribute *attr, char *buf)
256{ 256{
257 struct hd_struct *p = dev_to_part(dev); 257 struct hd_struct *p = dev_to_part(dev);
258 return sprintf(buf, "%u\n", p->discard_alignment); 258 struct gendisk *disk = dev_to_disk(dev);
259 unsigned int alignment = 0;
260
261 if (disk->queue)
262 alignment = queue_limit_discard_alignment(&disk->queue->limits,
263 p->start_sect);
264 return sprintf(buf, "%u\n", alignment);
259} 265}
260 266
261ssize_t part_stat_show(struct device *dev, 267ssize_t part_stat_show(struct device *dev,
@@ -449,8 +455,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
449 p->start_sect = start; 455 p->start_sect = start;
450 p->alignment_offset = 456 p->alignment_offset =
451 queue_limit_alignment_offset(&disk->queue->limits, start); 457 queue_limit_alignment_offset(&disk->queue->limits, start);
452 p->discard_alignment =
453 queue_limit_discard_alignment(&disk->queue->limits, start);
454 p->nr_sects = len; 458 p->nr_sects = len;
455 p->partno = partno; 459 p->partno = partno;
456 p->policy = get_disk_ro(disk); 460 p->policy = get_disk_ro(disk);
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 19d6750d1d6c..6296b403c67a 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -310,6 +310,15 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
310 goto fail; 310 goto fail;
311 } 311 }
312 312
313 /* Check the GUID Partition Table header size */
314 if (le32_to_cpu((*gpt)->header_size) >
315 bdev_logical_block_size(state->bdev)) {
316 pr_debug("GUID Partition Table Header size is wrong: %u > %u\n",
317 le32_to_cpu((*gpt)->header_size),
318 bdev_logical_block_size(state->bdev));
319 goto fail;
320 }
321
313 /* Check the GUID Partition Table CRC */ 322 /* Check the GUID Partition Table CRC */
314 origcrc = le32_to_cpu((*gpt)->header_crc32); 323 origcrc = le32_to_cpu((*gpt)->header_crc32);
315 (*gpt)->header_crc32 = 0; 324 (*gpt)->header_crc32 = 0;
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index ce4f62440425..af9fdf046769 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -565,7 +565,7 @@ static bool ldm_validate_partition_table(struct parsed_partitions *state)
565 565
566 data = read_part_sector(state, 0, &sect); 566 data = read_part_sector(state, 0, &sect);
567 if (!data) { 567 if (!data) {
568 ldm_crit ("Disk read failed."); 568 ldm_info ("Disk read failed.");
569 return false; 569 return false;
570 } 570 }
571 571
@@ -1335,6 +1335,11 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags)
1335 1335
1336 list_add_tail (&f->list, frags); 1336 list_add_tail (&f->list, frags);
1337found: 1337found:
1338 if (rec >= f->num) {
1339 ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num);
1340 return false;
1341 }
1342
1338 if (f->map & (1 << rec)) { 1343 if (f->map & (1 << rec)) {
1339 ldm_error ("Duplicate VBLK, part %d.", rec); 1344 ldm_error ("Duplicate VBLK, part %d.", rec);
1340 f->map &= 0x7F; /* Mark the group as broken */ 1345 f->map &= 0x7F; /* Mark the group as broken */
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index df434c5f28fb..c1c729335924 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -20,6 +20,7 @@ proc-y += stat.o
20proc-y += uptime.o 20proc-y += uptime.o
21proc-y += version.o 21proc-y += version.o
22proc-y += softirqs.o 22proc-y += softirqs.o
23proc-y += namespaces.o
23proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o 24proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
24proc-$(CONFIG_NET) += proc_net.o 25proc-$(CONFIG_NET) += proc_net.o
25proc-$(CONFIG_PROC_KCORE) += kcore.o 26proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 5e4f776b0917..9b45ee84fbcc 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -131,7 +131,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
131 * you can test for combinations of others with 131 * you can test for combinations of others with
132 * simple bit tests. 132 * simple bit tests.
133 */ 133 */
134static const char *task_state_array[] = { 134static const char * const task_state_array[] = {
135 "R (running)", /* 0 */ 135 "R (running)", /* 0 */
136 "S (sleeping)", /* 1 */ 136 "S (sleeping)", /* 1 */
137 "D (disk sleep)", /* 2 */ 137 "D (disk sleep)", /* 2 */
@@ -147,7 +147,7 @@ static const char *task_state_array[] = {
147static inline const char *get_task_state(struct task_struct *tsk) 147static inline const char *get_task_state(struct task_struct *tsk)
148{ 148{
149 unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; 149 unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state;
150 const char **p = &task_state_array[0]; 150 const char * const *p = &task_state_array[0];
151 151
152 BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); 152 BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array));
153 153
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dfa532730e55..4ede550517a6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -600,7 +600,7 @@ static int proc_fd_access_allowed(struct inode *inode)
600 return allowed; 600 return allowed;
601} 601}
602 602
603static int proc_setattr(struct dentry *dentry, struct iattr *attr) 603int proc_setattr(struct dentry *dentry, struct iattr *attr)
604{ 604{
605 int error; 605 int error;
606 struct inode *inode = dentry->d_inode; 606 struct inode *inode = dentry->d_inode;
@@ -894,20 +894,20 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
894 if (!task) 894 if (!task)
895 goto out_no_task; 895 goto out_no_task;
896 896
897 copied = -ENOMEM;
898 page = (char *)__get_free_page(GFP_TEMPORARY);
899 if (!page)
900 goto out_task;
901
897 mm = check_mem_permission(task); 902 mm = check_mem_permission(task);
898 copied = PTR_ERR(mm); 903 copied = PTR_ERR(mm);
899 if (IS_ERR(mm)) 904 if (IS_ERR(mm))
900 goto out_task; 905 goto out_free;
901 906
902 copied = -EIO; 907 copied = -EIO;
903 if (file->private_data != (void *)((long)current->self_exec_id)) 908 if (file->private_data != (void *)((long)current->self_exec_id))
904 goto out_mm; 909 goto out_mm;
905 910
906 copied = -ENOMEM;
907 page = (char *)__get_free_page(GFP_TEMPORARY);
908 if (!page)
909 goto out_mm;
910
911 copied = 0; 911 copied = 0;
912 while (count > 0) { 912 while (count > 0) {
913 int this_len, retval; 913 int this_len, retval;
@@ -929,9 +929,11 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
929 count -= retval; 929 count -= retval;
930 } 930 }
931 *ppos = dst; 931 *ppos = dst;
932 free_page((unsigned long) page); 932
933out_mm: 933out_mm:
934 mmput(mm); 934 mmput(mm);
935out_free:
936 free_page((unsigned long) page);
935out_task: 937out_task:
936 put_task_struct(task); 938 put_task_struct(task);
937out_no_task: 939out_no_task:
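
The mem_write() reorder above is the classic goto-unwind discipline: the page is now allocated before the permission check, and the exit labels run in reverse order of acquisition so every failure path releases exactly what is held. The skeleton, with acquire_a/acquire_b and friends as placeholder names:

static int op(void)
{
        void *a, *b;
        int err = -ENOMEM;

        a = acquire_a();                /* first resource ...           */
        if (!a)
                goto out;
        b = acquire_b();
        if (!b)
                goto out_a;

        err = do_work(a, b);

        release_b(b);                   /* second resource, undone first */
out_a:
        release_a(a);                   /* ... released last             */
out:
        return err;
}
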
@@ -1059,7 +1061,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1059{ 1061{
1060 struct task_struct *task; 1062 struct task_struct *task;
1061 char buffer[PROC_NUMBUF]; 1063 char buffer[PROC_NUMBUF];
1062 long oom_adjust; 1064 int oom_adjust;
1063 unsigned long flags; 1065 unsigned long flags;
1064 int err; 1066 int err;
1065 1067
@@ -1071,7 +1073,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1071 goto out; 1073 goto out;
1072 } 1074 }
1073 1075
1074 err = strict_strtol(strstrip(buffer), 0, &oom_adjust); 1076 err = kstrtoint(strstrip(buffer), 0, &oom_adjust);
1075 if (err) 1077 if (err)
1076 goto out; 1078 goto out;
1077 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && 1079 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
@@ -1168,7 +1170,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1168 struct task_struct *task; 1170 struct task_struct *task;
1169 char buffer[PROC_NUMBUF]; 1171 char buffer[PROC_NUMBUF];
1170 unsigned long flags; 1172 unsigned long flags;
1171 long oom_score_adj; 1173 int oom_score_adj;
1172 int err; 1174 int err;
1173 1175
1174 memset(buffer, 0, sizeof(buffer)); 1176 memset(buffer, 0, sizeof(buffer));
@@ -1179,7 +1181,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1179 goto out; 1181 goto out;
1180 } 1182 }
1181 1183
1182 err = strict_strtol(strstrip(buffer), 0, &oom_score_adj); 1184 err = kstrtoint(strstrip(buffer), 0, &oom_score_adj);
1183 if (err) 1185 if (err)
1184 goto out; 1186 goto out;
1185 if (oom_score_adj < OOM_SCORE_ADJ_MIN || 1187 if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
@@ -1468,7 +1470,7 @@ sched_autogroup_write(struct file *file, const char __user *buf,
1468 struct inode *inode = file->f_path.dentry->d_inode; 1470 struct inode *inode = file->f_path.dentry->d_inode;
1469 struct task_struct *p; 1471 struct task_struct *p;
1470 char buffer[PROC_NUMBUF]; 1472 char buffer[PROC_NUMBUF];
1471 long nice; 1473 int nice;
1472 int err; 1474 int err;
1473 1475
1474 memset(buffer, 0, sizeof(buffer)); 1476 memset(buffer, 0, sizeof(buffer));
@@ -1477,9 +1479,9 @@ sched_autogroup_write(struct file *file, const char __user *buf,
1477 if (copy_from_user(buffer, buf, count)) 1479 if (copy_from_user(buffer, buf, count))
1478 return -EFAULT; 1480 return -EFAULT;
1479 1481
1480 err = strict_strtol(strstrip(buffer), 0, &nice); 1482 err = kstrtoint(strstrip(buffer), 0, &nice);
1481 if (err) 1483 if (err < 0)
1482 return -EINVAL; 1484 return err;
1483 1485
1484 p = get_proc_task(inode); 1486 p = get_proc_task(inode);
1485 if (!p) 1487 if (!p)
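
Several writers in base.c switch from strict_strtol() into a long to kstrtoint() into an int, which both removes the silent long-to-int truncation window and lets the parser's own error codes propagate instead of being flattened to -EINVAL. Usage in isolation:

int val;
int err;

err = kstrtoint(strstrip(buffer), 0, &val);     /* base 0 = auto-detect */
if (err < 0)
        return err;             /* -EINVAL or -ERANGE from the parser */
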
@@ -1576,57 +1578,6 @@ static const struct file_operations proc_pid_set_comm_operations = {
1576 .release = single_release, 1578 .release = single_release,
1577}; 1579};
1578 1580
1579/*
1580 * We added or removed a vma mapping the executable. The vmas are only mapped
1581 * during exec and are not mapped with the mmap system call.
1582 * Callers must hold down_write() on the mm's mmap_sem for these
1583 */
1584void added_exe_file_vma(struct mm_struct *mm)
1585{
1586 mm->num_exe_file_vmas++;
1587}
1588
1589void removed_exe_file_vma(struct mm_struct *mm)
1590{
1591 mm->num_exe_file_vmas--;
1592 if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
1593 fput(mm->exe_file);
1594 mm->exe_file = NULL;
1595 }
1596
1597}
1598
1599void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
1600{
1601 if (new_exe_file)
1602 get_file(new_exe_file);
1603 if (mm->exe_file)
1604 fput(mm->exe_file);
1605 mm->exe_file = new_exe_file;
1606 mm->num_exe_file_vmas = 0;
1607}
1608
1609struct file *get_mm_exe_file(struct mm_struct *mm)
1610{
1611 struct file *exe_file;
1612
1613 /* We need mmap_sem to protect against races with removal of
1614 * VM_EXECUTABLE vmas */
1615 down_read(&mm->mmap_sem);
1616 exe_file = mm->exe_file;
1617 if (exe_file)
1618 get_file(exe_file);
1619 up_read(&mm->mmap_sem);
1620 return exe_file;
1621}
1622
1623void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
1624{
1625 /* It's safe to write the exe_file pointer without exe_file_lock because
1626 * this is called during fork when the task is not yet in /proc */
1627 newmm->exe_file = get_mm_exe_file(oldmm);
1628}
1629
1630static int proc_exe_link(struct inode *inode, struct path *exe_path) 1581static int proc_exe_link(struct inode *inode, struct path *exe_path)
1631{ 1582{
1632 struct task_struct *task; 1583 struct task_struct *task;
@@ -1736,8 +1687,7 @@ static int task_dumpable(struct task_struct *task)
1736 return 0; 1687 return 0;
1737} 1688}
1738 1689
1739 1690struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
1740static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
1741{ 1691{
1742 struct inode * inode; 1692 struct inode * inode;
1743 struct proc_inode *ei; 1693 struct proc_inode *ei;
@@ -1779,7 +1729,7 @@ out_unlock:
1779 return NULL; 1729 return NULL;
1780} 1730}
1781 1731
1782static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 1732int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1783{ 1733{
1784 struct inode *inode = dentry->d_inode; 1734 struct inode *inode = dentry->d_inode;
1785 struct task_struct *task; 1735 struct task_struct *task;
@@ -1820,7 +1770,7 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
1820 * made this apply to all per process world readable and executable 1770 * made this apply to all per process world readable and executable
1821 * directories. 1771 * directories.
1822 */ 1772 */
1823static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) 1773int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1824{ 1774{
1825 struct inode *inode; 1775 struct inode *inode;
1826 struct task_struct *task; 1776 struct task_struct *task;
@@ -1862,7 +1812,7 @@ static int pid_delete_dentry(const struct dentry * dentry)
1862 return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; 1812 return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
1863} 1813}
1864 1814
1865static const struct dentry_operations pid_dentry_operations = 1815const struct dentry_operations pid_dentry_operations =
1866{ 1816{
1867 .d_revalidate = pid_revalidate, 1817 .d_revalidate = pid_revalidate,
1868 .d_delete = pid_delete_dentry, 1818 .d_delete = pid_delete_dentry,
@@ -1870,9 +1820,6 @@ static const struct dentry_operations pid_dentry_operations =
1870 1820
1871/* Lookups */ 1821/* Lookups */
1872 1822
1873typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
1874 struct task_struct *, const void *);
1875
1876/* 1823/*
1877 * Fill a directory entry. 1824 * Fill a directory entry.
1878 * 1825 *
@@ -1885,8 +1832,8 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
1885 * reported by readdir in sync with the inode numbers reported 1832 * reported by readdir in sync with the inode numbers reported
1886 * by stat. 1833 * by stat.
1887 */ 1834 */
1888static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 1835int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
1889 char *name, int len, 1836 const char *name, int len,
1890 instantiate_t instantiate, struct task_struct *task, const void *ptr) 1837 instantiate_t instantiate, struct task_struct *task, const void *ptr)
1891{ 1838{
1892 struct dentry *child, *dir = filp->f_path.dentry; 1839 struct dentry *child, *dir = filp->f_path.dentry;
@@ -2820,6 +2767,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2820 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), 2767 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
2821 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 2768 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2822 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 2769 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2770 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
2823#ifdef CONFIG_NET 2771#ifdef CONFIG_NET
2824 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), 2772 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
2825#endif 2773#endif
@@ -3168,6 +3116,7 @@ out_no_task:
3168static const struct pid_entry tid_base_stuff[] = { 3116static const struct pid_entry tid_base_stuff[] = {
3169 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 3117 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
3170 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 3118 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
3119 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
3171 REG("environ", S_IRUSR, proc_environ_operations), 3120 REG("environ", S_IRUSR, proc_environ_operations),
3172 INF("auxv", S_IRUSR, proc_pid_auxv), 3121 INF("auxv", S_IRUSR, proc_pid_auxv),
3173 ONE("status", S_IRUGO, proc_pid_status), 3122 ONE("status", S_IRUGO, proc_pid_status),
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index f1281339b6fa..f1637f17c37c 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -674,6 +674,7 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
674 } 674 }
675 return ent; 675 return ent;
676} 676}
677EXPORT_SYMBOL(proc_mkdir_mode);
677 678
678struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, 679struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
679 struct proc_dir_entry *parent) 680 struct proc_dir_entry *parent)
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d15aa1b1cc8f..74b48cfa1bb2 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -28,6 +28,7 @@ static void proc_evict_inode(struct inode *inode)
28{ 28{
29 struct proc_dir_entry *de; 29 struct proc_dir_entry *de;
30 struct ctl_table_header *head; 30 struct ctl_table_header *head;
31 const struct proc_ns_operations *ns_ops;
31 32
32 truncate_inode_pages(&inode->i_data, 0); 33 truncate_inode_pages(&inode->i_data, 0);
33 end_writeback(inode); 34 end_writeback(inode);
@@ -44,6 +45,10 @@ static void proc_evict_inode(struct inode *inode)
44 rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); 45 rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);
45 sysctl_head_put(head); 46 sysctl_head_put(head);
46 } 47 }
48 /* Release any associated namespace */
49 ns_ops = PROC_I(inode)->ns_ops;
50 if (ns_ops && ns_ops->put)
51 ns_ops->put(PROC_I(inode)->ns);
47} 52}
48 53
49static struct kmem_cache * proc_inode_cachep; 54static struct kmem_cache * proc_inode_cachep;
@@ -62,6 +67,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
62 ei->pde = NULL; 67 ei->pde = NULL;
63 ei->sysctl = NULL; 68 ei->sysctl = NULL;
64 ei->sysctl_entry = NULL; 69 ei->sysctl_entry = NULL;
70 ei->ns = NULL;
71 ei->ns_ops = NULL;
65 inode = &ei->vfs_inode; 72 inode = &ei->vfs_inode;
66 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 73 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
67 return inode; 74 return inode;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index c03e8d3a3a5b..7838e5cfec14 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -61,6 +61,14 @@ extern const struct file_operations proc_pagemap_operations;
61extern const struct file_operations proc_net_operations; 61extern const struct file_operations proc_net_operations;
62extern const struct inode_operations proc_net_inode_operations; 62extern const struct inode_operations proc_net_inode_operations;
63 63
64struct proc_maps_private {
65 struct pid *pid;
66 struct task_struct *task;
67#ifdef CONFIG_MMU
68 struct vm_area_struct *tail_vma;
69#endif
70};
71
64void proc_init_inodecache(void); 72void proc_init_inodecache(void);
65 73
66static inline struct pid *proc_pid(struct inode *inode) 74static inline struct pid *proc_pid(struct inode *inode)
@@ -119,3 +127,21 @@ struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
119 */ 127 */
120int proc_readdir(struct file *, void *, filldir_t); 128int proc_readdir(struct file *, void *, filldir_t);
121struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); 129struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
130
131
132
133/* Lookups */
134typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
135 struct task_struct *, const void *);
136int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
137 const char *name, int len,
138 instantiate_t instantiate, struct task_struct *task, const void *ptr);
139int pid_revalidate(struct dentry *dentry, struct nameidata *nd);
140struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task);
141extern const struct dentry_operations pid_dentry_operations;
142int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
143int proc_setattr(struct dentry *dentry, struct iattr *attr);
144
145extern const struct inode_operations proc_ns_dir_inode_operations;
146extern const struct file_operations proc_ns_dir_operations;
147
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
new file mode 100644
index 000000000000..781dec5bd682
--- /dev/null
+++ b/fs/proc/namespaces.c
@@ -0,0 +1,198 @@
1#include <linux/proc_fs.h>
2#include <linux/nsproxy.h>
3#include <linux/sched.h>
4#include <linux/ptrace.h>
5#include <linux/fs_struct.h>
6#include <linux/mount.h>
7#include <linux/path.h>
8#include <linux/namei.h>
9#include <linux/file.h>
10#include <linux/utsname.h>
11#include <net/net_namespace.h>
12#include <linux/mnt_namespace.h>
13#include <linux/ipc_namespace.h>
14#include <linux/pid_namespace.h>
15#include "internal.h"
16
17
18static const struct proc_ns_operations *ns_entries[] = {
19#ifdef CONFIG_NET_NS
20 &netns_operations,
21#endif
22#ifdef CONFIG_UTS_NS
23 &utsns_operations,
24#endif
25#ifdef CONFIG_IPC_NS
26 &ipcns_operations,
27#endif
28};
29
30static const struct file_operations ns_file_operations = {
31 .llseek = no_llseek,
32};
33
34static struct dentry *proc_ns_instantiate(struct inode *dir,
35 struct dentry *dentry, struct task_struct *task, const void *ptr)
36{
37 const struct proc_ns_operations *ns_ops = ptr;
38 struct inode *inode;
39 struct proc_inode *ei;
40 struct dentry *error = ERR_PTR(-ENOENT);
41
42 inode = proc_pid_make_inode(dir->i_sb, task);
43 if (!inode)
44 goto out;
45
46 ei = PROC_I(inode);
47 inode->i_mode = S_IFREG|S_IRUSR;
48 inode->i_fop = &ns_file_operations;
49 ei->ns_ops = ns_ops;
50 ei->ns = ns_ops->get(task);
51 if (!ei->ns)
52 goto out_iput;
53
54 dentry->d_op = &pid_dentry_operations;
55 d_add(dentry, inode);
56 /* Close the race of the process dying before we return the dentry */
57 if (pid_revalidate(dentry, NULL))
58 error = NULL;
59out:
60 return error;
61out_iput:
62 iput(inode);
63 goto out;
64}
65
66static int proc_ns_fill_cache(struct file *filp, void *dirent,
67 filldir_t filldir, struct task_struct *task,
68 const struct proc_ns_operations *ops)
69{
70 return proc_fill_cache(filp, dirent, filldir,
71 ops->name, strlen(ops->name),
72 proc_ns_instantiate, task, ops);
73}
74
75static int proc_ns_dir_readdir(struct file *filp, void *dirent,
76 filldir_t filldir)
77{
78 int i;
79 struct dentry *dentry = filp->f_path.dentry;
80 struct inode *inode = dentry->d_inode;
81 struct task_struct *task = get_proc_task(inode);
82 const struct proc_ns_operations **entry, **last;
83 ino_t ino;
84 int ret;
85
86 ret = -ENOENT;
87 if (!task)
88 goto out_no_task;
89
90 ret = -EPERM;
91 if (!ptrace_may_access(task, PTRACE_MODE_READ))
92 goto out;
93
94 ret = 0;
95 i = filp->f_pos;
96 switch (i) {
97 case 0:
98 ino = inode->i_ino;
99 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
100 goto out;
101 i++;
102 filp->f_pos++;
103 /* fall through */
104 case 1:
105 ino = parent_ino(dentry);
106 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
107 goto out;
108 i++;
109 filp->f_pos++;
110 /* fall through */
111 default:
112 i -= 2;
113 if (i >= ARRAY_SIZE(ns_entries)) {
114 ret = 1;
115 goto out;
116 }
117 entry = ns_entries + i;
118 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
119 while (entry <= last) {
120 if (proc_ns_fill_cache(filp, dirent, filldir,
121 task, *entry) < 0)
122 goto out;
123 filp->f_pos++;
124 entry++;
125 }
126 }
127
128 ret = 1;
129out:
130 put_task_struct(task);
131out_no_task:
132 return ret;
133}
134
135const struct file_operations proc_ns_dir_operations = {
136 .read = generic_read_dir,
137 .readdir = proc_ns_dir_readdir,
138};
139
140static struct dentry *proc_ns_dir_lookup(struct inode *dir,
141 struct dentry *dentry, struct nameidata *nd)
142{
143 struct dentry *error;
144 struct task_struct *task = get_proc_task(dir);
145 const struct proc_ns_operations **entry, **last;
146 unsigned int len = dentry->d_name.len;
147
148 error = ERR_PTR(-ENOENT);
149
150 if (!task)
151 goto out_no_task;
152
153 error = ERR_PTR(-EPERM);
154 if (!ptrace_may_access(task, PTRACE_MODE_READ))
155 goto out;
156
157 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
158 for (entry = ns_entries; entry <= last; entry++) {
159 if (strlen((*entry)->name) != len)
160 continue;
161 if (!memcmp(dentry->d_name.name, (*entry)->name, len))
162 break;
163 }
164 error = ERR_PTR(-ENOENT);
165 if (entry > last)
166 goto out;
167
168 error = proc_ns_instantiate(dir, dentry, task, *entry);
169out:
170 put_task_struct(task);
171out_no_task:
172 return error;
173}
174
175const struct inode_operations proc_ns_dir_inode_operations = {
176 .lookup = proc_ns_dir_lookup,
177 .getattr = pid_getattr,
178 .setattr = proc_setattr,
179};
180
181struct file *proc_ns_fget(int fd)
182{
183 struct file *file;
184
185 file = fget(fd);
186 if (!file)
187 return ERR_PTR(-EBADF);
188
189 if (file->f_op != &ns_file_operations)
190 goto out_invalid;
191
192 return file;
193
194out_invalid:
195 fput(file);
196 return ERR_PTR(-EINVAL);
197}
198
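
namespaces.c gives each task a /proc/<pid>/ns directory of per-namespace files (net, uts, ipc here), instantiated through the generic pid_entry machinery above; the otherwise-empty ns_file_operations exists so proc_ns_fget() can recognize such an open file by its f_op pointer and hand the associated namespace to a consumer. A hedged userspace sketch of the intended use, assuming a setns()-style syscall from the same series rather than anything shown in this diff:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/self/ns/uts", O_RDONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* a setns(fd, 0) call would re-attach to this namespace here */
        close(fd);
        return 0;
}
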
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 1cffa2b8a2fc..9758b654a1bc 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -138,9 +138,9 @@ static int stat_open(struct inode *inode, struct file *file)
138 struct seq_file *m; 138 struct seq_file *m;
139 int res; 139 int res;
140 140
141 /* don't ask for more than the kmalloc() max size, currently 128 KB */ 141 /* don't ask for more than the kmalloc() max size */
142 if (size > 128 * 1024) 142 if (size > KMALLOC_MAX_SIZE)
143 size = 128 * 1024; 143 size = KMALLOC_MAX_SIZE;
144 buf = kmalloc(size, GFP_KERNEL); 144 buf = kmalloc(size, GFP_KERNEL);
145 if (!buf) 145 if (!buf)
146 return -ENOMEM; 146 return -ENOMEM;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 318d8654989b..25b6a887adb9 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -211,7 +211,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
211{ 211{
212 struct mm_struct *mm = vma->vm_mm; 212 struct mm_struct *mm = vma->vm_mm;
213 struct file *file = vma->vm_file; 213 struct file *file = vma->vm_file;
214 int flags = vma->vm_flags; 214 vm_flags_t flags = vma->vm_flags;
215 unsigned long ino = 0; 215 unsigned long ino = 0;
216 unsigned long long pgoff = 0; 216 unsigned long long pgoff = 0;
217 unsigned long start, end; 217 unsigned long start, end;
@@ -536,15 +536,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
536 char buffer[PROC_NUMBUF]; 536 char buffer[PROC_NUMBUF];
537 struct mm_struct *mm; 537 struct mm_struct *mm;
538 struct vm_area_struct *vma; 538 struct vm_area_struct *vma;
539 long type; 539 int type;
540 int rv;
540 541
541 memset(buffer, 0, sizeof(buffer)); 542 memset(buffer, 0, sizeof(buffer));
542 if (count > sizeof(buffer) - 1) 543 if (count > sizeof(buffer) - 1)
543 count = sizeof(buffer) - 1; 544 count = sizeof(buffer) - 1;
544 if (copy_from_user(buffer, buf, count)) 545 if (copy_from_user(buffer, buf, count))
545 return -EFAULT; 546 return -EFAULT;
546 if (strict_strtol(strstrip(buffer), 10, &type)) 547 rv = kstrtoint(strstrip(buffer), 10, &type);
547 return -EINVAL; 548 if (rv < 0)
549 return rv;
548 if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) 550 if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
549 return -EINVAL; 551 return -EINVAL;
550 task = get_proc_task(file->f_path.dentry->d_inode); 552 task = get_proc_task(file->f_path.dentry->d_inode);
@@ -769,18 +771,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
769 if (!task) 771 if (!task)
770 goto out; 772 goto out;
771 773
772 mm = mm_for_maps(task);
773 ret = PTR_ERR(mm);
774 if (!mm || IS_ERR(mm))
775 goto out_task;
776
777 ret = -EINVAL; 774 ret = -EINVAL;
778 /* file position must be aligned */ 775 /* file position must be aligned */
779 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) 776 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
780 goto out_task; 777 goto out_task;
781 778
782 ret = 0; 779 ret = 0;
783
784 if (!count) 780 if (!count)
785 goto out_task; 781 goto out_task;
786 782
@@ -788,7 +784,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
788 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); 784 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
789 ret = -ENOMEM; 785 ret = -ENOMEM;
790 if (!pm.buffer) 786 if (!pm.buffer)
791 goto out_mm; 787 goto out_task;
788
789 mm = mm_for_maps(task);
790 ret = PTR_ERR(mm);
791 if (!mm || IS_ERR(mm))
792 goto out_free;
792 793
793 pagemap_walk.pmd_entry = pagemap_pte_range; 794 pagemap_walk.pmd_entry = pagemap_pte_range;
794 pagemap_walk.pte_hole = pagemap_pte_hole; 795 pagemap_walk.pte_hole = pagemap_pte_hole;
@@ -831,7 +832,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
831 len = min(count, PM_ENTRY_BYTES * pm.pos); 832 len = min(count, PM_ENTRY_BYTES * pm.pos);
832 if (copy_to_user(buf, pm.buffer, len)) { 833 if (copy_to_user(buf, pm.buffer, len)) {
833 ret = -EFAULT; 834 ret = -EFAULT;
834 goto out_free; 835 goto out_mm;
835 } 836 }
836 copied += len; 837 copied += len;
837 buf += len; 838 buf += len;
@@ -841,10 +842,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
841 if (!ret || ret == PM_END_OF_BUFFER) 842 if (!ret || ret == PM_END_OF_BUFFER)
842 ret = copied; 843 ret = copied;
843 844
844out_free:
845 kfree(pm.buffer);
846out_mm: 845out_mm:
847 mmput(mm); 846 mmput(mm);
847out_free:
848 kfree(pm.buffer);
848out_task: 849out_task:
849 put_task_struct(task); 850 put_task_struct(task);
850out: 851out:
@@ -858,7 +859,192 @@ const struct file_operations proc_pagemap_operations = {
858#endif /* CONFIG_PROC_PAGE_MONITOR */ 859#endif /* CONFIG_PROC_PAGE_MONITOR */
859 860
860#ifdef CONFIG_NUMA 861#ifdef CONFIG_NUMA
861extern int show_numa_map(struct seq_file *m, void *v); 862
863struct numa_maps {
864 struct vm_area_struct *vma;
865 unsigned long pages;
866 unsigned long anon;
867 unsigned long active;
868 unsigned long writeback;
869 unsigned long mapcount_max;
870 unsigned long dirty;
871 unsigned long swapcache;
872 unsigned long node[MAX_NUMNODES];
873};
874
875struct numa_maps_private {
876 struct proc_maps_private proc_maps;
877 struct numa_maps md;
878};
879
880static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty)
881{
882 int count = page_mapcount(page);
883
884 md->pages++;
885 if (pte_dirty || PageDirty(page))
886 md->dirty++;
887
888 if (PageSwapCache(page))
889 md->swapcache++;
890
891 if (PageActive(page) || PageUnevictable(page))
892 md->active++;
893
894 if (PageWriteback(page))
895 md->writeback++;
896
897 if (PageAnon(page))
898 md->anon++;
899
900 if (count > md->mapcount_max)
901 md->mapcount_max = count;
902
903 md->node[page_to_nid(page)]++;
904}
905
906static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
907 unsigned long end, struct mm_walk *walk)
908{
909 struct numa_maps *md;
910 spinlock_t *ptl;
911 pte_t *orig_pte;
912 pte_t *pte;
913
914 md = walk->private;
915 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
916 do {
917 struct page *page;
918 int nid;
919
920 if (!pte_present(*pte))
921 continue;
922
923 page = vm_normal_page(md->vma, addr, *pte);
924 if (!page)
925 continue;
926
927 if (PageReserved(page))
928 continue;
929
930 nid = page_to_nid(page);
931 if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
932 continue;
933
934 gather_stats(page, md, pte_dirty(*pte));
935
936 } while (pte++, addr += PAGE_SIZE, addr != end);
937 pte_unmap_unlock(orig_pte, ptl);
938 return 0;
939}
940#ifdef CONFIG_HUGETLB_PAGE
941static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
942 unsigned long addr, unsigned long end, struct mm_walk *walk)
943{
944 struct numa_maps *md;
945 struct page *page;
946
947 if (pte_none(*pte))
948 return 0;
949
950 page = pte_page(*pte);
951 if (!page)
952 return 0;
953
954 md = walk->private;
955 gather_stats(page, md, pte_dirty(*pte));
956 return 0;
957}
958
959#else
960static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
961 unsigned long addr, unsigned long end, struct mm_walk *walk)
962{
963 return 0;
964}
965#endif
966
967/*
968 * Display pages allocated per node and memory policy via /proc.
969 */
970static int show_numa_map(struct seq_file *m, void *v)
971{
972 struct numa_maps_private *numa_priv = m->private;
973 struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
974 struct vm_area_struct *vma = v;
975 struct numa_maps *md = &numa_priv->md;
976 struct file *file = vma->vm_file;
977 struct mm_struct *mm = vma->vm_mm;
978 struct mm_walk walk = {};
979 struct mempolicy *pol;
980 int n;
981 char buffer[50];
982
983 if (!mm)
984 return 0;
985
986 /* Ensure we start with an empty set of numa_maps statistics. */
987 memset(md, 0, sizeof(*md));
988
989 md->vma = vma;
990
991 walk.hugetlb_entry = gather_hugetbl_stats;
992 walk.pmd_entry = gather_pte_stats;
993 walk.private = md;
994 walk.mm = mm;
995
996 pol = get_vma_policy(proc_priv->task, vma, vma->vm_start);
997 mpol_to_str(buffer, sizeof(buffer), pol, 0);
998 mpol_cond_put(pol);
999
1000 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1001
1002 if (file) {
1003 seq_printf(m, " file=");
1004 seq_path(m, &file->f_path, "\n\t= ");
1005 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1006 seq_printf(m, " heap");
1007 } else if (vma->vm_start <= mm->start_stack &&
1008 vma->vm_end >= mm->start_stack) {
1009 seq_printf(m, " stack");
1010 }
1011
1012 walk_page_range(vma->vm_start, vma->vm_end, &walk);
1013
1014 if (!md->pages)
1015 goto out;
1016
1017 if (md->anon)
1018 seq_printf(m, " anon=%lu", md->anon);
1019
1020 if (md->dirty)
1021 seq_printf(m, " dirty=%lu", md->dirty);
1022
1023 if (md->pages != md->anon && md->pages != md->dirty)
1024 seq_printf(m, " mapped=%lu", md->pages);
1025
1026 if (md->mapcount_max > 1)
1027 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1028
1029 if (md->swapcache)
1030 seq_printf(m, " swapcache=%lu", md->swapcache);
1031
1032 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1033 seq_printf(m, " active=%lu", md->active);
1034
1035 if (md->writeback)
1036 seq_printf(m, " writeback=%lu", md->writeback);
1037
1038 for_each_node_state(n, N_HIGH_MEMORY)
1039 if (md->node[n])
1040 seq_printf(m, " N%d=%lu", n, md->node[n]);
1041out:
1042 seq_putc(m, '\n');
1043
1044 if (m->count < m->size)
1045 m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
1046 return 0;
1047}
862 1048
863static const struct seq_operations proc_pid_numa_maps_op = { 1049static const struct seq_operations proc_pid_numa_maps_op = {
864 .start = m_start, 1050 .start = m_start,
@@ -869,7 +1055,20 @@ static const struct seq_operations proc_pid_numa_maps_op = {
869 1055
870static int numa_maps_open(struct inode *inode, struct file *file) 1056static int numa_maps_open(struct inode *inode, struct file *file)
871{ 1057{
872 return do_maps_open(inode, file, &proc_pid_numa_maps_op); 1058 struct numa_maps_private *priv;
1059 int ret = -ENOMEM;
1060 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
1061 if (priv) {
1062 priv->proc_maps.pid = proc_pid(inode);
1063 ret = seq_open(file, &proc_pid_numa_maps_op);
1064 if (!ret) {
1065 struct seq_file *m = file->private_data;
1066 m->private = priv;
1067 } else {
1068 kfree(priv);
1069 }
1070 }
1071 return ret;
873} 1072}
874 1073
875const struct file_operations proc_numa_maps_operations = { 1074const struct file_operations proc_numa_maps_operations = {
@@ -878,4 +1077,4 @@ const struct file_operations proc_numa_maps_operations = {
878 .llseek = seq_lseek, 1077 .llseek = seq_lseek,
879 .release = seq_release_private, 1078 .release = seq_release_private,
880}; 1079};
881#endif 1080#endif /* CONFIG_NUMA */
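
numa_maps_open() above is the usual pairing for a seq_file that needs per-open state: allocate the private struct first, then seq_open(), then stash the pointer in m->private so the seq_release_private() already wired into proc_numa_maps_operations frees it on the last close. Reduced to the pattern (foo_* names hypothetical):

static int foo_open(struct inode *inode, struct file *file)
{
        struct foo_private *priv;
        int ret = -ENOMEM;

        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
        if (priv) {
                ret = seq_open(file, &foo_seq_ops);
                if (!ret) {
                        struct seq_file *m = file->private_data;
                        m->private = priv;      /* freed by seq_release_private */
                } else {
                        kfree(priv);
                }
        }
        return ret;
}
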
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 74802bc5ded9..cd99bf557650 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -35,6 +35,46 @@ static u64 vmcore_size;
35 35
36static struct proc_dir_entry *proc_vmcore = NULL; 36static struct proc_dir_entry *proc_vmcore = NULL;
37 37
38/*
39 * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
40 * The called function has to take care of module refcounting.
41 */
42static int (*oldmem_pfn_is_ram)(unsigned long pfn);
43
44int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn))
45{
46 if (oldmem_pfn_is_ram)
47 return -EBUSY;
48 oldmem_pfn_is_ram = fn;
49 return 0;
50}
51EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram);
52
53void unregister_oldmem_pfn_is_ram(void)
54{
55 oldmem_pfn_is_ram = NULL;
56 wmb();
57}
58EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram);
59
60static int pfn_is_ram(unsigned long pfn)
61{
62 int (*fn)(unsigned long pfn);
63 /* pfn is ram unless fn() checks pagetype */
64 int ret = 1;
65
66 /*
67 * Ask hypervisor if the pfn is really ram.
68 * A ballooned page contains no data and reading from such a page
69 * will cause high load in the hypervisor.
70 */
71 fn = oldmem_pfn_is_ram;
72 if (fn)
73 ret = fn(pfn);
74
75 return ret;
76}
77
38/* Reads a page from the oldmem device from given offset. */ 78/* Reads a page from the oldmem device from given offset. */
39static ssize_t read_from_oldmem(char *buf, size_t count, 79static ssize_t read_from_oldmem(char *buf, size_t count,
40 u64 *ppos, int userbuf) 80 u64 *ppos, int userbuf)
@@ -55,9 +95,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
55 else 95 else
56 nr_bytes = count; 96 nr_bytes = count;
57 97
58 tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf); 98 /* If pfn is not ram, return zeros for sparse dump files */
59 if (tmp < 0) 99 if (pfn_is_ram(pfn) == 0)
60 return tmp; 100 memset(buf, 0, nr_bytes);
101 else {
102 tmp = copy_oldmem_page(pfn, buf, nr_bytes,
103 offset, userbuf);
104 if (tmp < 0)
105 return tmp;
106 }
61 *ppos += nr_bytes; 107 *ppos += nr_bytes;
62 count -= nr_bytes; 108 count -= nr_bytes;
63 buf += nr_bytes; 109 buf += nr_bytes;
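
The vmcore hooks above let a hypervisor guest declare certain old-kernel pfns as not-RAM, so a sparse dump reads back zeros instead of faulting ballooned pages through the hypervisor; only one callback may be registered at a time. A registration sketch, with the callback body left hypothetical:

static int my_pfn_is_ram(unsigned long pfn)
{
        /* >0: ram, 0: not ram (e.g. ballooned out), <0: error */
        return my_page_is_backed(pfn);
}

static int __init my_init(void)
{
        if (register_oldmem_pfn_is_ram(&my_pfn_is_ram))
                pr_warn("another pfn_is_ram callback is registered\n");
        return 0;
}
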
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index f835a25625ff..f2c3ff20ea68 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -152,21 +152,27 @@ EXPORT_SYMBOL_GPL(pstore_register);
152void pstore_get_records(void) 152void pstore_get_records(void)
153{ 153{
154 struct pstore_info *psi = psinfo; 154 struct pstore_info *psi = psinfo;
155 size_t size; 155 ssize_t size;
156 u64 id; 156 u64 id;
157 enum pstore_type_id type; 157 enum pstore_type_id type;
158 struct timespec time; 158 struct timespec time;
159 int failed = 0; 159 int failed = 0, rc;
160 160
161 if (!psi) 161 if (!psi)
162 return; 162 return;
163 163
164 mutex_lock(&psinfo->buf_mutex); 164 mutex_lock(&psinfo->buf_mutex);
165 rc = psi->open(psi);
166 if (rc)
167 goto out;
168
165 while ((size = psi->read(&id, &type, &time)) > 0) { 169 while ((size = psi->read(&id, &type, &time)) > 0) {
166 if (pstore_mkfile(type, psi->name, id, psi->buf, size, 170 if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size,
167 time, psi->erase)) 171 time, psi->erase))
168 failed++; 172 failed++;
169 } 173 }
174 psi->close(psi);
175out:
170 mutex_unlock(&psinfo->buf_mutex); 176 mutex_unlock(&psinfo->buf_mutex);
171 177
172 if (failed) 178 if (failed)
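
The pstore type change matters because read() reports errors as negative values: with size declared size_t, a -1 wraps to a huge positive value, the "> 0" loop test passes, and pstore_mkfile() is fed garbage. In miniature:

size_t  u = (size_t)-1;         /* error return, wrapped to SIZE_MAX */
ssize_t s = -1;

/* (u > 0) is true:  the loop body would run on a failed read  */
/* (s > 0) is false: the loop terminates as intended           */
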
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index d3c032f5fa0a..5b572c89e6c4 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -691,8 +691,11 @@ static void prune_dqcache(int count)
691 * This is called from kswapd when we think we need some 691 * This is called from kswapd when we think we need some
692 * more memory 692 * more memory
693 */ 693 */
694static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 694static int shrink_dqcache_memory(struct shrinker *shrink,
695 struct shrink_control *sc)
695{ 696{
697 int nr = sc->nr_to_scan;
698
696 if (nr) { 699 if (nr) {
697 spin_lock(&dq_list_lock); 700 spin_lock(&dq_list_lock);
698 prune_dqcache(nr); 701 prune_dqcache(nr);
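
shrink_dqcache_memory() is adapted here to the updated shrinker calling convention, in which nr_to_scan (and the gfp mask) arrive bundled in a struct shrink_control rather than as separate arguments. The shape of a converted shrinker, with the cache helpers hypothetical:

static int my_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
        if (sc->nr_to_scan)
                prune_my_cache(sc->nr_to_scan);

        return my_cache_object_count();  /* remaining objects, for the VM */
}

static struct shrinker my_shrinker = {
        .shrink = my_shrink,
        .seeks  = DEFAULT_SEEKS,
};
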
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 118662690cdf..76c8164d5651 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -831,6 +831,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
831 INITIALIZE_PATH(path); 831 INITIALIZE_PATH(path);
832 struct reiserfs_dir_entry de; 832 struct reiserfs_dir_entry de;
833 833
834 dentry_unhash(dentry);
835
834 /* we will be doing 2 balancings and update 2 stat data, we change quotas 836 /* we will be doing 2 balancings and update 2 stat data, we change quotas
835 * of the owner of the directory and of the owner of the parent directory. 837 * of the owner of the directory and of the owner of the parent directory.
836 * The quota structure is possibly deleted only on last iput => outside 838 * The quota structure is possibly deleted only on last iput => outside
@@ -1225,6 +1227,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1225 unsigned long savelink = 1; 1227 unsigned long savelink = 1;
1226 struct timespec ctime; 1228 struct timespec ctime;
1227 1229
1230 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
1231 dentry_unhash(new_dentry);
1232
1228 /* three balancings: (1) old name removal, (2) new name insertion 1233 /* three balancings: (1) old name removal, (2) new name insertion
1229 and (3) maybe "save" link insertion 1234 and (3) maybe "save" link insertion
1230 stat data updates: (1) old directory, 1235 stat data updates: (1) old directory,
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 47d2a4498b03..50f1abccd1cd 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -105,7 +105,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
105 mutex_unlock(&dentry->d_inode->i_mutex); 105 mutex_unlock(&dentry->d_inode->i_mutex);
106 if (!error) 106 if (!error)
107 d_delete(dentry); 107 d_delete(dentry);
108 dput(dentry);
109 108
110 return error; 109 return error;
111} 110}
diff --git a/fs/splice.c b/fs/splice.c
index 50a5d978da16..aa866d309695 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -162,6 +162,14 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = {
162 .get = generic_pipe_buf_get, 162 .get = generic_pipe_buf_get,
163}; 163};
164 164
165static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
166{
167 smp_mb();
168 if (waitqueue_active(&pipe->wait))
169 wake_up_interruptible(&pipe->wait);
170 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
171}
172
165/** 173/**
166 * splice_to_pipe - fill passed data into a pipe 174 * splice_to_pipe - fill passed data into a pipe
167 * @pipe: pipe to fill 175 * @pipe: pipe to fill
@@ -247,12 +255,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
247 255
248 pipe_unlock(pipe); 256 pipe_unlock(pipe);
249 257
250 if (do_wakeup) { 258 if (do_wakeup)
251 smp_mb(); 259 wakeup_pipe_readers(pipe);
252 if (waitqueue_active(&pipe->wait))
253 wake_up_interruptible(&pipe->wait);
254 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
255 }
256 260
257 while (page_nr < spd_pages) 261 while (page_nr < spd_pages)
258 spd->spd_release(spd, page_nr++); 262 spd->spd_release(spd, page_nr++);
@@ -1892,12 +1896,9 @@ retry:
1892 /* 1896 /*
1893 * If we put data in the output pipe, wakeup any potential readers. 1897 * If we put data in the output pipe, wakeup any potential readers.
1894 */ 1898 */
1895 if (ret > 0) { 1899 if (ret > 0)
1896 smp_mb(); 1900 wakeup_pipe_readers(opipe);
1897 if (waitqueue_active(&opipe->wait)) 1901
1898 wake_up_interruptible(&opipe->wait);
1899 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1900 }
1901 if (input_wakeup) 1902 if (input_wakeup)
1902 wakeup_pipe_writers(ipipe); 1903 wakeup_pipe_writers(ipipe);
1903 1904
@@ -1976,12 +1977,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1976 /* 1977 /*
1977 * If we put data in the output pipe, wakeup any potential readers. 1978 * If we put data in the output pipe, wakeup any potential readers.
1978 */ 1979 */
1979 if (ret > 0) { 1980 if (ret > 0)
1980 smp_mb(); 1981 wakeup_pipe_readers(opipe);
1981 if (waitqueue_active(&opipe->wait))
1982 wake_up_interruptible(&opipe->wait);
1983 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1984 }
1985 1982
1986 return ret; 1983 return ret;
1987} 1984}
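
The splice.c change is a pure factoring: three identical reader-wakeup sequences collapse into wakeup_pipe_readers(). The smp_mb() the helper preserves is load-bearing, since the pipe buffer updates must be visible before waitqueue_active() samples the wait queue, or a concurrently-sleeping reader could be missed. The producer-side pattern, in the abstract (publish_data() hypothetical):

publish_data();
smp_mb();                               /* order publish before the check */
if (waitqueue_active(&wq))              /* unsafe without the barrier */
        wake_up_interruptible(&wq);
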
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index efc309fa3035..7797218d0b30 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -42,7 +42,7 @@ config SQUASHFS_LZO
42 select LZO_DECOMPRESS 42 select LZO_DECOMPRESS
43 help 43 help
44 Saying Y here includes support for reading Squashfs file systems 44 Saying Y here includes support for reading Squashfs file systems
45 compressed with LZO compresssion. LZO compression is mainly 45 compressed with LZO compression. LZO compression is mainly
46 aimed at embedded systems with slower CPUs where the overheads 46 aimed at embedded systems with slower CPUs where the overheads
47 of zlib are too high. 47 of zlib are too high.
48 48
@@ -57,7 +57,7 @@ config SQUASHFS_XZ
57 select XZ_DEC 57 select XZ_DEC
58 help 58 help
59 Saying Y here includes support for reading Squashfs file systems 59 Saying Y here includes support for reading Squashfs file systems
60 compressed with XZ compresssion. XZ gives better compression than 60 compressed with XZ compression. XZ gives better compression than
61 the default zlib compression, at the expense of greater CPU and 61 the default zlib compression, at the expense of greater CPU and
62 memory overhead. 62 memory overhead.
63 63
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 8ab48bc2fa7d..ed0eb2a921f4 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -2,7 +2,7 @@
2 * Squashfs - a compressed read only filesystem for Linux 2 * Squashfs - a compressed read only filesystem for Linux
3 * 3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk> 5 * Phillip Lougher <phillip@squashfs.org.uk>
6 * 6 *
7 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License 8 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index c37b520132ff..f744be98cd5a 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -2,7 +2,7 @@
2 * Squashfs - a compressed read only filesystem for Linux 2 * Squashfs - a compressed read only filesystem for Linux
3 * 3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk> 5 * Phillip Lougher <phillip@squashfs.org.uk>
6 * 6 *
7 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License 8 * modify it under the terms of the GNU General Public License
@@ -29,7 +29,7 @@
29 * plus functions layered ontop of the generic cache implementation to 29 * plus functions layered ontop of the generic cache implementation to
30 * access the metadata and fragment caches. 30 * access the metadata and fragment caches.
31 * 31 *
32 * To avoid out of memory and fragmentation isssues with vmalloc the cache 32 * To avoid out of memory and fragmentation issues with vmalloc the cache
33 * uses sequences of kmalloced PAGE_CACHE_SIZE buffers. 33 * uses sequences of kmalloced PAGE_CACHE_SIZE buffers.
34 * 34 *
35 * It should be noted that the cache is not used for file datablocks, these 35 * It should be noted that the cache is not used for file datablocks, these
@@ -393,19 +393,36 @@ struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb,
393/* 393/*
394 * Read a filesystem table (uncompressed sequence of bytes) from disk 394 * Read a filesystem table (uncompressed sequence of bytes) from disk
395 */ 395 */
396int squashfs_read_table(struct super_block *sb, void *buffer, u64 block, 396void *squashfs_read_table(struct super_block *sb, u64 block, int length)
397 int length)
398{ 397{
399 int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 398 int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
400 int i, res; 399 int i, res;
401 void **data = kcalloc(pages, sizeof(void *), GFP_KERNEL); 400 void *table, *buffer, **data;
402 if (data == NULL) 401
403 return -ENOMEM; 402 table = buffer = kmalloc(length, GFP_KERNEL);
403 if (table == NULL)
404 return ERR_PTR(-ENOMEM);
405
406 data = kcalloc(pages, sizeof(void *), GFP_KERNEL);
407 if (data == NULL) {
408 res = -ENOMEM;
409 goto failed;
410 }
404 411
405 for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) 412 for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
406 data[i] = buffer; 413 data[i] = buffer;
414
407 res = squashfs_read_data(sb, data, block, length | 415 res = squashfs_read_data(sb, data, block, length |
408 SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages); 416 SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages);
417
409 kfree(data); 418 kfree(data);
410 return res; 419
420 if (res < 0)
421 goto failed;
422
423 return table;
424
425failed:
426 kfree(table);
427 return ERR_PTR(res);
411} 428}
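
squashfs_read_table() now allocates the buffer itself and returns it (or an ERR_PTR) instead of filling a caller-supplied one, which is what lets the table readers later in this series shrink to a read plus sanity checks. Caller shape under the new contract:

__le64 *table;

table = squashfs_read_table(sb, table_start, length);
if (IS_ERR(table))
        return table;           /* propagate -ENOMEM or the read error */

/* ... validate contents, kfree(table) and return -EINVAL on rejection ... */
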
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index e921bd213738..9f1b0bb96f13 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -2,7 +2,7 @@
2 * Squashfs - a compressed read only filesystem for Linux 2 * Squashfs - a compressed read only filesystem for Linux
3 * 3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
5 * Phillip Lougher <phillip@lougher.demon.co.uk> 5 * Phillip Lougher <phillip@squashfs.org.uk>
6 * 6 *
7 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License 8 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 099745ad5691..8ba70cff09a6 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -4,7 +4,7 @@
4 * Squashfs - a compressed read only filesystem for Linux 4 * Squashfs - a compressed read only filesystem for Linux
5 * 5 *
6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
7 * Phillip Lougher <phillip@lougher.demon.co.uk> 7 * Phillip Lougher <phillip@squashfs.org.uk>
8 * 8 *
9 * This program is free software; you can redistribute it and/or 9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License 10 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 3f79cd1d0c19..9dfe2ce0fb70 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -2,7 +2,7 @@
2 * Squashfs - a compressed read only filesystem for Linux 2 * Squashfs - a compressed read only filesystem for Linux
3 * 3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk> 5 * Phillip Lougher <phillip@squashfs.org.uk>
6 * 6 *
7 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License 8 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 7f93d5a9ee05..730c56248c9b 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -2,7 +2,7 @@
2 * Squashfs - a compressed read only filesystem for Linux 2 * Squashfs - a compressed read only filesystem for Linux
3 * 3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk> 5 * Phillip Lougher <phillip@squashfs.org.uk>
6 * 6 *
7 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License 8 * modify it under the terms of the GNU General Public License
@@ -121,30 +121,38 @@ static struct dentry *squashfs_get_parent(struct dentry *child)
121 * Read uncompressed inode lookup table indexes off disk into memory 121 * Read uncompressed inode lookup table indexes off disk into memory
122 */ 122 */
123__le64 *squashfs_read_inode_lookup_table(struct super_block *sb, 123__le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
124 u64 lookup_table_start, unsigned int inodes) 124 u64 lookup_table_start, u64 next_table, unsigned int inodes)
125{ 125{
126 unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes); 126 unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes);
127 __le64 *inode_lookup_table; 127 __le64 *table;
128 int err;
129 128
130 TRACE("In read_inode_lookup_table, length %d\n", length); 129 TRACE("In read_inode_lookup_table, length %d\n", length);
131 130
132 /* Allocate inode lookup table indexes */ 131 /* Sanity check values */
133 inode_lookup_table = kmalloc(length, GFP_KERNEL); 132
134 if (inode_lookup_table == NULL) { 133 /* there should always be at least one inode */
135 ERROR("Failed to allocate inode lookup table\n"); 134 if (inodes == 0)
136 return ERR_PTR(-ENOMEM); 135 return ERR_PTR(-EINVAL);
137 } 136
137 /* length bytes should not extend into the next table - this check
138 * also traps instances where lookup_table_start is incorrectly larger
139 * than the next table start
140 */
141 if (lookup_table_start + length > next_table)
142 return ERR_PTR(-EINVAL);
143
144 table = squashfs_read_table(sb, lookup_table_start, length);
138 145
139 err = squashfs_read_table(sb, inode_lookup_table, lookup_table_start, 146 /*
140 length); 147 * table[0] points to the first inode lookup table metadata block,
141 if (err < 0) { 148 * this should be less than lookup_table_start
142 ERROR("unable to read inode lookup table\n"); 149 */
143 kfree(inode_lookup_table); 150 if (!IS_ERR(table) && table[0] >= lookup_table_start) {
144 return ERR_PTR(err); 151 kfree(table);
152 return ERR_PTR(-EINVAL);
145 } 153 }
146 154
147 return inode_lookup_table; 155 return table;
148} 156}
149 157
150 158
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index a25c5060bdcb..38bb1c640559 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -2,7 +2,7 @@
2 * Squashfs - a compressed read only filesystem for Linux 2 * Squashfs - a compressed read only filesystem for Linux
3 * 3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk> 5 * Phillip Lougher <phillip@squashfs.org.uk>
6 * 6 *
7 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License 8 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 7eef571443c6..1516a6490bfb 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -2,7 +2,7 @@
2 * Squashfs - a compressed read only filesystem for Linux 2 * Squashfs - a compressed read only filesystem for Linux
3 * 3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk> 5 * Phillip Lougher <phillip@squashfs.org.uk>
6 * 6 *
7 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License 8 * modify it under the terms of the GNU General Public License
@@ -71,26 +71,29 @@ int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment,
71 * Read the uncompressed fragment lookup table indexes off disk into memory 71 * Read the uncompressed fragment lookup table indexes off disk into memory
72 */ 72 */
73__le64 *squashfs_read_fragment_index_table(struct super_block *sb, 73__le64 *squashfs_read_fragment_index_table(struct super_block *sb,
74 u64 fragment_table_start, unsigned int fragments) 74 u64 fragment_table_start, u64 next_table, unsigned int fragments)
75{ 75{
76 unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments); 76 unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments);
77 __le64 *fragment_index; 77 __le64 *table;
78 int err;
79 78
80 /* Allocate fragment lookup table indexes */ 79 /*
81 fragment_index = kmalloc(length, GFP_KERNEL); 80 * Sanity check, length bytes should not extend into the next table -
82 if (fragment_index == NULL) { 81 * this check also traps instances where fragment_table_start is
83 ERROR("Failed to allocate fragment index table\n"); 82 * incorrectly larger than the next table start
84 return ERR_PTR(-ENOMEM); 83 */
85 } 84 if (fragment_table_start + length > next_table)
85 return ERR_PTR(-EINVAL);
86
87 table = squashfs_read_table(sb, fragment_table_start, length);
86 88
87 err = squashfs_read_table(sb, fragment_index, fragment_table_start, 89 /*
88 length); 90 * table[0] points to the first fragment table metadata block, this
89 if (err < 0) { 91 * should be less than fragment_table_start
90 ERROR("unable to read fragment index table\n"); 92 */
91 kfree(fragment_index); 93 if (!IS_ERR(table) && table[0] >= fragment_table_start) {
92 return ERR_PTR(err); 94 kfree(table);
95 return ERR_PTR(-EINVAL);
93 } 96 }
94 97
95 return fragment_index; 98 return table;
96} 99}
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index d8f32452638e..a70858e0fb44 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -2,7 +2,7 @@
2 * Squashfs - a compressed read only filesystem for Linux 2 * Squashfs - a compressed read only filesystem for Linux
3 * 3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk> 5 * Phillip Lougher <phillip@squashfs.org.uk>
6 * 6 *
7 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License 8 * modify it under the terms of the GNU General Public License
@@ -66,27 +66,37 @@ int squashfs_get_id(struct super_block *sb, unsigned int index,
66 * Read uncompressed id lookup table indexes from disk into memory 66 * Read uncompressed id lookup table indexes from disk into memory
67 */ 67 */
68__le64 *squashfs_read_id_index_table(struct super_block *sb, 68__le64 *squashfs_read_id_index_table(struct super_block *sb,
69 u64 id_table_start, unsigned short no_ids) 69 u64 id_table_start, u64 next_table, unsigned short no_ids)
70{ 70{
71 unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids); 71 unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids);
72 __le64 *id_table; 72 __le64 *table;
73 int err;
74 73
75 TRACE("In read_id_index_table, length %d\n", length); 74 TRACE("In read_id_index_table, length %d\n", length);
76 75
77 /* Allocate id lookup table indexes */ 76 /* Sanity check values */
78 id_table = kmalloc(length, GFP_KERNEL); 77
79 if (id_table == NULL) { 78 /* there should always be at least one id */
80 ERROR("Failed to allocate id index table\n"); 79 if (no_ids == 0)
81 return ERR_PTR(-ENOMEM); 80 return ERR_PTR(-EINVAL);
82 } 81
82 /*
83 * length bytes should not extend into the next table - this check
84 * also traps instances where id_table_start is incorrectly larger
85 * than the next table start
86 */
87 if (id_table_start + length > next_table)
88 return ERR_PTR(-EINVAL);
89
90 table = squashfs_read_table(sb, id_table_start, length);
83 91
84 err = squashfs_read_table(sb, id_table, id_table_start, length); 92 /*
85 if (err < 0) { 93 * table[0] points to the first id lookup table metadata block, this
86 ERROR("unable to read id index table\n"); 94 * should be less than id_table_start
87 kfree(id_table); 95 */
88 return ERR_PTR(err); 96 if (!IS_ERR(table) && table[0] >= id_table_start) {
97 kfree(table);
98 return ERR_PTR(-EINVAL);
89 } 99 }
90 100
91 return id_table; 101 return table;
92} 102}
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 62e63ad25075..04bebcaa2373 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -2,7 +2,7 @@
2 * Squashfs - a compressed read only filesystem for Linux 2 * Squashfs - a compressed read only filesystem for Linux
3 * 3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk> 5 * Phillip Lougher <phillip@squashfs.org.uk>
6 * 6 *
7 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License 8 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 5d922a6701ab..4bc63ac64bc0 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -2,7 +2,7 @@
2 * Squashfs - a compressed read only filesystem for Linux 2 * Squashfs - a compressed read only filesystem for Linux
3 * 3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk> 5 * Phillip Lougher <phillip@squashfs.org.uk>
6 * 6 *
7 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License 8 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 1f2e608b8785..e3be6a71cfa7 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -2,7 +2,7 @@
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -44,24 +44,24 @@ extern struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *,
 		u64, int);
 extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
 		u64, int);
-extern int squashfs_read_table(struct super_block *, void *, u64, int);
+extern void *squashfs_read_table(struct super_block *, u64, int);
 
 /* decompressor.c */
 extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
 extern void *squashfs_decompressor_init(struct super_block *, unsigned short);
 
 /* export.c */
-extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
+extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64,
 		unsigned int);
 
 /* fragment.c */
 extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *);
 extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
-		u64, unsigned int);
+		u64, u64, unsigned int);
 
 /* id.c */
 extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);
-extern __le64 *squashfs_read_id_index_table(struct super_block *, u64,
+extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, u64,
 		unsigned short);
 
 /* inode.c */
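
The prototype change to squashfs_read_table() is the pivot for the rest of this series: instead of filling a caller-supplied buffer and returning an int, it now allocates the buffer itself and encodes failure in the returned pointer. A hedged sketch of a caller under the new convention — use_table() is hypothetical, the squashfs_read_table() prototype is the one declared above, and IS_ERR()/PTR_ERR()/kfree() are the standard kernel helpers:

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include "squashfs.h"

/* use_table() is hypothetical; it only illustrates the convention */
static int use_table(struct super_block *sb, u64 block, int length)
{
	void *table = squashfs_read_table(sb, block, length);

	if (IS_ERR(table))		/* failure is encoded in the pointer */
		return PTR_ERR(table);
	/* ... use the table ... */
	kfree(table);			/* the caller still owns the buffer */
	return 0;
}
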
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 4582c568ef4d..b4a4e539a08c 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -4,7 +4,7 @@
  * Squashfs
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index 359baefc01fc..73588e7700ed 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -4,7 +4,7 @@
  * Squashfs
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index d9037a5215f0..651f0b31d296 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -4,7 +4,7 @@
  * Squashfs
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 5c8184c061a4..6f26abee3597 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -2,7 +2,7 @@
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -83,7 +83,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	long long root_inode;
 	unsigned short flags;
 	unsigned int fragments;
-	u64 lookup_table_start, xattr_id_table_start;
+	u64 lookup_table_start, xattr_id_table_start, next_table;
 	int err;
 
 	TRACE("Entered squashfs_fill_superblock\n");
@@ -95,12 +95,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	msblk = sb->s_fs_info;
 
-	sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
-	if (sblk == NULL) {
-		ERROR("Failed to allocate squashfs_super_block\n");
-		goto failure;
-	}
-
 	msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE);
 	msblk->devblksize_log2 = ffz(~msblk->devblksize);
 
@@ -114,10 +108,12 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	 * of bytes_used) we need to set it to an initial sensible dummy value
 	 */
 	msblk->bytes_used = sizeof(*sblk);
-	err = squashfs_read_table(sb, sblk, SQUASHFS_START, sizeof(*sblk));
+	sblk = squashfs_read_table(sb, SQUASHFS_START, sizeof(*sblk));
 
-	if (err < 0) {
+	if (IS_ERR(sblk)) {
 		ERROR("unable to read squashfs_super_block\n");
+		err = PTR_ERR(sblk);
+		sblk = NULL;
 		goto failed_mount;
 	}
 
@@ -218,18 +214,61 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
+	/* Handle xattrs */
+	sb->s_xattr = squashfs_xattr_handlers;
+	xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
+	if (xattr_id_table_start == SQUASHFS_INVALID_BLK) {
+		next_table = msblk->bytes_used;
+		goto allocate_id_index_table;
+	}
+
+	/* Allocate and read xattr id lookup table */
+	msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
+		xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
+	if (IS_ERR(msblk->xattr_id_table)) {
+		ERROR("unable to read xattr id index table\n");
+		err = PTR_ERR(msblk->xattr_id_table);
+		msblk->xattr_id_table = NULL;
+		if (err != -ENOTSUPP)
+			goto failed_mount;
+	}
+	next_table = msblk->xattr_table;
+
+allocate_id_index_table:
 	/* Allocate and read id index table */
 	msblk->id_table = squashfs_read_id_index_table(sb,
-		le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids));
+		le64_to_cpu(sblk->id_table_start), next_table,
+		le16_to_cpu(sblk->no_ids));
 	if (IS_ERR(msblk->id_table)) {
+		ERROR("unable to read id index table\n");
 		err = PTR_ERR(msblk->id_table);
 		msblk->id_table = NULL;
 		goto failed_mount;
 	}
+	next_table = msblk->id_table[0];
+
+	/* Handle inode lookup table */
+	lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
+	if (lookup_table_start == SQUASHFS_INVALID_BLK)
+		goto handle_fragments;
+
+	/* Allocate and read inode lookup table */
+	msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
+		lookup_table_start, next_table, msblk->inodes);
+	if (IS_ERR(msblk->inode_lookup_table)) {
+		ERROR("unable to read inode lookup table\n");
+		err = PTR_ERR(msblk->inode_lookup_table);
+		msblk->inode_lookup_table = NULL;
+		goto failed_mount;
+	}
+	next_table = msblk->inode_lookup_table[0];
 
+	sb->s_export_op = &squashfs_export_ops;
+
+handle_fragments:
 	fragments = le32_to_cpu(sblk->fragments);
 	if (fragments == 0)
-		goto allocate_lookup_table;
+		goto check_directory_table;
 
 	msblk->fragment_cache = squashfs_cache_init("fragment",
 		SQUASHFS_CACHED_FRAGMENTS, msblk->block_size);
@@ -240,45 +279,29 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	/* Allocate and read fragment index table */
 	msblk->fragment_index = squashfs_read_fragment_index_table(sb,
-		le64_to_cpu(sblk->fragment_table_start), fragments);
+		le64_to_cpu(sblk->fragment_table_start), next_table, fragments);
 	if (IS_ERR(msblk->fragment_index)) {
+		ERROR("unable to read fragment index table\n");
 		err = PTR_ERR(msblk->fragment_index);
 		msblk->fragment_index = NULL;
 		goto failed_mount;
 	}
+	next_table = msblk->fragment_index[0];
 
-allocate_lookup_table:
-	lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
-	if (lookup_table_start == SQUASHFS_INVALID_BLK)
-		goto allocate_xattr_table;
-
-	/* Allocate and read inode lookup table */
-	msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
-		lookup_table_start, msblk->inodes);
-	if (IS_ERR(msblk->inode_lookup_table)) {
-		err = PTR_ERR(msblk->inode_lookup_table);
-		msblk->inode_lookup_table = NULL;
+check_directory_table:
+	/* Sanity check directory_table */
+	if (msblk->directory_table >= next_table) {
+		err = -EINVAL;
 		goto failed_mount;
 	}
 
-	sb->s_export_op = &squashfs_export_ops;
-
-allocate_xattr_table:
-	sb->s_xattr = squashfs_xattr_handlers;
-	xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
-	if (xattr_id_table_start == SQUASHFS_INVALID_BLK)
-		goto allocate_root;
-
-	/* Allocate and read xattr id lookup table */
-	msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
-		xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
-	if (IS_ERR(msblk->xattr_id_table)) {
-		err = PTR_ERR(msblk->xattr_id_table);
-		msblk->xattr_id_table = NULL;
-		if (err != -ENOTSUPP)
-			goto failed_mount;
+	/* Sanity check inode_table */
+	if (msblk->inode_table >= msblk->directory_table) {
+		err = -EINVAL;
+		goto failed_mount;
 	}
-allocate_root:
+
+	/* allocate root */
 	root = new_inode(sb);
 	if (!root) {
 		err = -ENOMEM;
@@ -318,11 +341,6 @@ failed_mount:
 	sb->s_fs_info = NULL;
 	kfree(sblk);
 	return err;
-
-failure:
-	kfree(sb->s_fs_info);
-	sb->s_fs_info = NULL;
-	return -ENOMEM;
 }
 
 
@@ -475,5 +493,5 @@ static const struct super_operations squashfs_super_ops = {
 module_init(init_squashfs_fs);
 module_exit(exit_squashfs_fs);
 MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem");
-MODULE_AUTHOR("Phillip Lougher <phillip@lougher.demon.co.uk>");
+MODULE_AUTHOR("Phillip Lougher <phillip@squashfs.org.uk>");
 MODULE_LICENSE("GPL");
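
The reordered fill_super path reads each table bounded by the start of the previously read one, carried along in next_table, and the final checks pin down the directory and inode tables. A small illustrative check of the same ordering invariant (standalone C, names hypothetical):

#include <stdbool.h>
#include <stdint.h>

/* illustrative only: the ordering the final sanity checks enforce */
static bool tables_ordered(uint64_t inode_table, uint64_t directory_table,
			   uint64_t next_table)
{
	/* the directory table must start below the last table read */
	if (directory_table >= next_table)
		return false;
	/* the inode table must precede the directory table */
	if (inode_table >= directory_table)
		return false;
	return true;
}
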
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index ec86434921e1..1191817264cc 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -2,7 +2,7 @@
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index 3876c36699a1..92fcde7b4d61 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -2,7 +2,7 @@
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2010
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index b634efce4bde..c83f5d9ec125 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -2,7 +2,7 @@
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2010
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -31,6 +31,7 @@ static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
 		u64 start, u64 *xattr_table_start, int *xattr_ids)
 {
 	ERROR("Xattrs in filesystem, these will be ignored\n");
+	*xattr_table_start = start;
 	return ERR_PTR(-ENOTSUPP);
 }
 
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index 05385dbe1465..c89607d690c4 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -2,7 +2,7 @@
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2010
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -67,34 +67,29 @@ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
 		u64 *xattr_table_start, int *xattr_ids)
 {
 	unsigned int len;
-	__le64 *xid_table;
-	struct squashfs_xattr_id_table id_table;
-	int err;
+	struct squashfs_xattr_id_table *id_table;
+
+	id_table = squashfs_read_table(sb, start, sizeof(*id_table));
+	if (IS_ERR(id_table))
+		return (__le64 *) id_table;
+
+	*xattr_table_start = le64_to_cpu(id_table->xattr_table_start);
+	*xattr_ids = le32_to_cpu(id_table->xattr_ids);
+	kfree(id_table);
+
+	/* Sanity check values */
+
+	/* there is always at least one xattr id */
+	if (*xattr_ids == 0)
+		return ERR_PTR(-EINVAL);
+
+	/* xattr_table should be less than start */
+	if (*xattr_table_start >= start)
+		return ERR_PTR(-EINVAL);
 
-	err = squashfs_read_table(sb, &id_table, start, sizeof(id_table));
-	if (err < 0) {
-		ERROR("unable to read xattr id table\n");
-		return ERR_PTR(err);
-	}
-	*xattr_table_start = le64_to_cpu(id_table.xattr_table_start);
-	*xattr_ids = le32_to_cpu(id_table.xattr_ids);
 	len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
 
 	TRACE("In read_xattr_index_table, length %d\n", len);
 
-	/* Allocate xattr id lookup table indexes */
-	xid_table = kmalloc(len, GFP_KERNEL);
-	if (xid_table == NULL) {
-		ERROR("Failed to allocate xattr id index table\n");
-		return ERR_PTR(-ENOMEM);
-	}
-
-	err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len);
-	if (err < 0) {
-		ERROR("unable to read xattr id index table\n");
-		kfree(xid_table);
-		return ERR_PTR(err);
-	}
-
-	return xid_table;
+	return squashfs_read_table(sb, start + sizeof(*id_table), len);
 }
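
The `(__le64 *) id_table` cast above works because ERR_PTR() encodes the error in the pointer value itself, so it survives casts between pointer types. A minimal sketch of the idiom (assumes kernel headers; propagate() is hypothetical):

#include <linux/err.h>
#include <linux/types.h>

/* propagate() is hypothetical and only demonstrates the idiom */
static __le64 *propagate(void *table)
{
	if (IS_ERR(table))
		return (__le64 *) table;	/* still IS_ERR() to the caller */
	return table;
}
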
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index aa47a286d1f8..1760b7d108f6 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -2,7 +2,7 @@
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 517688b32ffa..55d918fd2d86 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -2,7 +2,7 @@
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
diff --git a/fs/super.c b/fs/super.c
index 8a06881b1920..c75593953c52 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -31,6 +31,7 @@
 #include <linux/mutex.h>
 #include <linux/backing-dev.h>
 #include <linux/rculist_bl.h>
+#include <linux/cleancache.h>
 #include "internal.h"
 
 
@@ -112,6 +113,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		s->s_maxbytes = MAX_NON_LFS;
 		s->s_op = &default_op;
 		s->s_time_gran = 1000000000;
+		s->cleancache_poolid = -1;
 	}
 out:
 	return s;
@@ -177,6 +179,7 @@ void deactivate_locked_super(struct super_block *s)
 {
 	struct file_system_type *fs = s->s_type;
 	if (atomic_dec_and_test(&s->s_active)) {
+		cleancache_flush_fs(s);
 		fs->kill_sb(s);
 		/*
 		 * We need to call rcu_barrier so all the delayed rcu free
@@ -948,8 +951,7 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
 	 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
 	 * but s_maxbytes was an unsigned long long for many releases. Throw
 	 * this warning for a little while to try and catch filesystems that
-	 * violate this rule. This warning should be either removed or
-	 * converted to a BUG() in 2.6.34.
+	 * violate this rule.
 	 */
 	WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
 		"negative value (%lld)\n", type->name, sb->s_maxbytes);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index da3fefe91a8f..1ad8c93c1b85 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -24,13 +24,6 @@
 
 #include "sysfs.h"
 
-/* used in crash dumps to help with debugging */
-static char last_sysfs_file[PATH_MAX];
-void sysfs_printk_last_file(void)
-{
-	printk(KERN_EMERG "last sysfs file: %s\n", last_sysfs_file);
-}
-
 /*
  * There's one sysfs_buffer for each open file and one
  * sysfs_open_dirent for each sysfs_dirent with one or more open
@@ -337,11 +330,6 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	struct sysfs_buffer *buffer;
 	const struct sysfs_ops *ops;
 	int error = -EACCES;
-	char *p;
-
-	p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
-	if (!IS_ERR(p))
-		memmove(last_sysfs_file, p, strlen(p) + 1);
 
 	/* need attr_sd for attr and ops, its parent for kobj */
 	if (!sysfs_get_active(attr_sd))
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index c8769dc222d8..194414f8298c 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -101,9 +101,9 @@ int sysfs_create_group(struct kobject *kobj,
 }
 
 /**
- * sysfs_update_group - given a directory kobject, create an attribute group
- * @kobj: The kobject to create the group on
- * @grp: The attribute group to create
+ * sysfs_update_group - given a directory kobject, update an attribute group
+ * @kobj: The kobject to update the group on
+ * @grp: The attribute group to update
  *
  * This function updates an attribute group. Unlike
  * sysfs_create_group(), it will explicitly not warn or error if any
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index e474fbcf8bde..e2cc6756f3b1 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -196,6 +196,8 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
 	struct inode *inode = dentry->d_inode;
 	int err = -ENOTEMPTY;
 
+	dentry_unhash(dentry);
+
 	if (sysv_empty_dir(inode)) {
 		err = sysv_unlink(dir, dentry);
 		if (!err) {
@@ -222,6 +224,9 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
 	struct sysv_dir_entry * old_de;
 	int err = -ENOENT;
 
+	if (new_inode && S_ISDIR(new_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	old_de = sysv_find_entry(old_dentry, &old_page);
 	if (!old_de)
 		goto out;
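
Both sysv hunks follow the same pattern seen across this series: with the VFS no longer unhashing victim dentries itself, each filesystem that depends on the old behaviour now calls dentry_unhash() for directories about to be removed, or overwritten by rename. A generic sketch of the rmdir side (example_rmdir() is hypothetical, not the sysv code):

#include <linux/dcache.h>
#include <linux/fs.h>

/* example_rmdir() is hypothetical, not the sysv implementation */
static int example_rmdir(struct inode *dir, struct dentry *dentry)
{
	dentry_unhash(dentry);	/* drop the victim from the dcache hash */
	/* ... verify the directory is empty, then remove it ... */
	return 0;
}
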
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 8c4fc1425b3e..f67acbdda5e8 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -22,16 +22,24 @@
 #include <linux/anon_inodes.h>
 #include <linux/timerfd.h>
 #include <linux/syscalls.h>
+#include <linux/rcupdate.h>
 
 struct timerfd_ctx {
 	struct hrtimer tmr;
 	ktime_t tintv;
+	ktime_t moffs;
 	wait_queue_head_t wqh;
 	u64 ticks;
 	int expired;
 	int clockid;
+	struct rcu_head rcu;
+	struct list_head clist;
+	bool might_cancel;
 };
 
+static LIST_HEAD(cancel_list);
+static DEFINE_SPINLOCK(cancel_lock);
+
 /*
  * This gets called when the timer event triggers. We set the "expired"
  * flag, but we do not re-arm the timer (in case it's necessary,
@@ -51,6 +59,63 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
 	return HRTIMER_NORESTART;
 }
 
+/*
+ * Called when the clock was set to cancel the timers in the cancel
+ * list.
+ */
+void timerfd_clock_was_set(void)
+{
+	ktime_t moffs = ktime_get_monotonic_offset();
+	struct timerfd_ctx *ctx;
+	unsigned long flags;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ctx, &cancel_list, clist) {
+		if (!ctx->might_cancel)
+			continue;
+		spin_lock_irqsave(&ctx->wqh.lock, flags);
+		if (ctx->moffs.tv64 != moffs.tv64) {
+			ctx->moffs.tv64 = KTIME_MAX;
+			wake_up_locked(&ctx->wqh);
+		}
+		spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+	}
+	rcu_read_unlock();
+}
+
+static void timerfd_remove_cancel(struct timerfd_ctx *ctx)
+{
+	if (ctx->might_cancel) {
+		ctx->might_cancel = false;
+		spin_lock(&cancel_lock);
+		list_del_rcu(&ctx->clist);
+		spin_unlock(&cancel_lock);
+	}
+}
+
+static bool timerfd_canceled(struct timerfd_ctx *ctx)
+{
+	if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX)
+		return false;
+	ctx->moffs = ktime_get_monotonic_offset();
+	return true;
+}
+
+static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
+{
+	if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) &&
+	    (flags & TFD_TIMER_CANCEL_ON_SET)) {
+		if (!ctx->might_cancel) {
+			ctx->might_cancel = true;
+			spin_lock(&cancel_lock);
+			list_add_rcu(&ctx->clist, &cancel_list);
+			spin_unlock(&cancel_lock);
+		}
+	} else if (ctx->might_cancel) {
+		timerfd_remove_cancel(ctx);
+	}
+}
+
 static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
 {
 	ktime_t remaining;
@@ -59,11 +124,12 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
 	return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
 }
 
-static void timerfd_setup(struct timerfd_ctx *ctx, int flags,
+static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
 			 const struct itimerspec *ktmr)
 {
 	enum hrtimer_mode htmode;
 	ktime_t texp;
+	int clockid = ctx->clockid;
 
 	htmode = (flags & TFD_TIMER_ABSTIME) ?
 		HRTIMER_MODE_ABS: HRTIMER_MODE_REL;
@@ -72,19 +138,24 @@ static void timerfd_setup(struct timerfd_ctx *ctx, int flags,
 	ctx->expired = 0;
 	ctx->ticks = 0;
 	ctx->tintv = timespec_to_ktime(ktmr->it_interval);
-	hrtimer_init(&ctx->tmr, ctx->clockid, htmode);
+	hrtimer_init(&ctx->tmr, clockid, htmode);
 	hrtimer_set_expires(&ctx->tmr, texp);
 	ctx->tmr.function = timerfd_tmrproc;
-	if (texp.tv64 != 0)
+	if (texp.tv64 != 0) {
 		hrtimer_start(&ctx->tmr, texp, htmode);
+		if (timerfd_canceled(ctx))
+			return -ECANCELED;
+	}
+	return 0;
 }
 
 static int timerfd_release(struct inode *inode, struct file *file)
 {
 	struct timerfd_ctx *ctx = file->private_data;
 
+	timerfd_remove_cancel(ctx);
 	hrtimer_cancel(&ctx->tmr);
-	kfree(ctx);
+	kfree_rcu(ctx, rcu);
 	return 0;
 }
 
@@ -118,8 +189,21 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
 		res = -EAGAIN;
 	else
 		res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
+
+	/*
+	 * If clock has changed, we do not care about the
+	 * ticks and we do not rearm the timer. Userspace must
+	 * reevaluate anyway.
+	 */
+	if (timerfd_canceled(ctx)) {
+		ctx->ticks = 0;
+		ctx->expired = 0;
+		res = -ECANCELED;
+	}
+
 	if (ctx->ticks) {
 		ticks = ctx->ticks;
+
 		if (ctx->expired && ctx->tintv.tv64) {
 			/*
 			 * If tintv.tv64 != 0, this is a periodic timer that
@@ -183,6 +267,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 	init_waitqueue_head(&ctx->wqh);
 	ctx->clockid = clockid;
 	hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
+	ctx->moffs = ktime_get_monotonic_offset();
 
 	ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
 			       O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
@@ -199,6 +284,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	struct file *file;
 	struct timerfd_ctx *ctx;
 	struct itimerspec ktmr, kotmr;
+	int ret;
 
 	if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
 		return -EFAULT;
@@ -213,6 +299,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 		return PTR_ERR(file);
 	ctx = file->private_data;
 
+	timerfd_setup_cancel(ctx, flags);
+
 	/*
 	 * We need to stop the existing timer before reprogramming
 	 * it to the new values.
@@ -240,14 +328,14 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	/*
 	 * Re-program the timer to the new value ...
 	 */
-	timerfd_setup(ctx, flags, &ktmr);
+	ret = timerfd_setup(ctx, flags, &ktmr);
 
 	spin_unlock_irq(&ctx->wqh.lock);
 	fput(file);
 	if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr)))
 		return -EFAULT;
 
-	return 0;
+	return ret;
 }
 
 SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
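
From userspace, the new behaviour is opted into with TFD_TIMER_CANCEL_ON_SET on an absolute CLOCK_REALTIME timer; a read() then fails with ECANCELED when the clock is set. A hedged usage sketch (wait_until() is hypothetical; the flag value matches the definition this series introduces):

#include <sys/timerfd.h>
#include <errno.h>
#include <stdint.h>
#include <time.h>
#include <unistd.h>

#ifndef TFD_TIMER_CANCEL_ON_SET
#define TFD_TIMER_CANCEL_ON_SET (1 << 1)	/* added by this series */
#endif

static int wait_until(time_t deadline)
{
	struct itimerspec its = { .it_value = { .tv_sec = deadline } };
	uint64_t ticks;
	int fd = timerfd_create(CLOCK_REALTIME, 0);

	if (fd < 0)
		return -1;
	if (timerfd_settime(fd, TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET,
			    &its, NULL) == 0 &&
	    read(fd, &ticks, sizeof(ticks)) < 0 && errno == ECANCELED) {
		/* the clock was set while waiting: recompute the deadline */
	}
	close(fd);
	return 0;
}
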
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 8b3a7da531eb..315de66e52b2 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -106,7 +106,7 @@ static long long get_liability(struct ubifs_info *c)
 	long long liab;
 
 	spin_lock(&c->space_lock);
-	liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth;
+	liab = c->bi.idx_growth + c->bi.data_growth + c->bi.dd_growth;
 	spin_unlock(&c->space_lock);
 	return liab;
 }
@@ -180,7 +180,7 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
 	int idx_lebs;
 	long long idx_size;
 
-	idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
+	idx_size = c->bi.old_idx_sz + c->bi.idx_growth + c->bi.uncommitted_idx;
 	/* And make sure we have thrice the index size of space reserved */
 	idx_size += idx_size << 1;
 	/*
@@ -292,13 +292,13 @@ static int can_use_rp(struct ubifs_info *c)
  * budgeted index space to the size of the current index, multiplies this by 3,
  * and makes sure this does not exceed the amount of free LEBs.
  *
- * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
+ * Notes about @c->bi.min_idx_lebs and @c->lst.idx_lebs variables:
  * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
  *   be large, because UBIFS does not do any index consolidation as long as
  *   there is free space. IOW, the index may take a lot of LEBs, but the LEBs
  *   will contain a lot of dirt.
- * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW,
- *   the index may be consolidated to take up to @c->min_idx_lebs LEBs.
+ * o @c->bi.min_idx_lebs is the number of LEBS the index presumably takes. IOW,
+ *   the index may be consolidated to take up to @c->bi.min_idx_lebs LEBs.
  *
  * This function returns zero in case of success, and %-ENOSPC in case of
  * failure.
@@ -343,13 +343,13 @@ static int do_budget_space(struct ubifs_info *c)
 		  c->lst.taken_empty_lebs;
 	if (unlikely(rsvd_idx_lebs > lebs)) {
 		dbg_budg("out of indexing space: min_idx_lebs %d (old %d), "
-			 "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs,
+			 "rsvd_idx_lebs %d", min_idx_lebs, c->bi.min_idx_lebs,
 			 rsvd_idx_lebs);
 		return -ENOSPC;
 	}
 
 	available = ubifs_calc_available(c, min_idx_lebs);
-	outstanding = c->budg_data_growth + c->budg_dd_growth;
+	outstanding = c->bi.data_growth + c->bi.dd_growth;
 
 	if (unlikely(available < outstanding)) {
 		dbg_budg("out of data space: available %lld, outstanding %lld",
@@ -360,7 +360,7 @@ static int do_budget_space(struct ubifs_info *c)
 	if (available - outstanding <= c->rp_size && !can_use_rp(c))
 		return -ENOSPC;
 
-	c->min_idx_lebs = min_idx_lebs;
+	c->bi.min_idx_lebs = min_idx_lebs;
 	return 0;
 }
 
@@ -393,11 +393,11 @@ static int calc_data_growth(const struct ubifs_info *c,
 {
 	int data_growth;
 
-	data_growth = req->new_ino ? c->inode_budget : 0;
+	data_growth = req->new_ino ? c->bi.inode_budget : 0;
 	if (req->new_page)
-		data_growth += c->page_budget;
+		data_growth += c->bi.page_budget;
 	if (req->new_dent)
-		data_growth += c->dent_budget;
+		data_growth += c->bi.dent_budget;
 	data_growth += req->new_ino_d;
 	return data_growth;
 }
@@ -413,12 +413,12 @@ static int calc_dd_growth(const struct ubifs_info *c,
 {
 	int dd_growth;
 
-	dd_growth = req->dirtied_page ? c->page_budget : 0;
+	dd_growth = req->dirtied_page ? c->bi.page_budget : 0;
 
 	if (req->dirtied_ino)
-		dd_growth += c->inode_budget << (req->dirtied_ino - 1);
+		dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1);
 	if (req->mod_dent)
-		dd_growth += c->dent_budget;
+		dd_growth += c->bi.dent_budget;
 	dd_growth += req->dirtied_ino_d;
 	return dd_growth;
 }
@@ -460,19 +460,19 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
 
 again:
 	spin_lock(&c->space_lock);
-	ubifs_assert(c->budg_idx_growth >= 0);
-	ubifs_assert(c->budg_data_growth >= 0);
-	ubifs_assert(c->budg_dd_growth >= 0);
+	ubifs_assert(c->bi.idx_growth >= 0);
+	ubifs_assert(c->bi.data_growth >= 0);
+	ubifs_assert(c->bi.dd_growth >= 0);
 
-	if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) {
+	if (unlikely(c->bi.nospace) && (c->bi.nospace_rp || !can_use_rp(c))) {
 		dbg_budg("no space");
 		spin_unlock(&c->space_lock);
 		return -ENOSPC;
 	}
 
-	c->budg_idx_growth += idx_growth;
-	c->budg_data_growth += data_growth;
-	c->budg_dd_growth += dd_growth;
+	c->bi.idx_growth += idx_growth;
+	c->bi.data_growth += data_growth;
+	c->bi.dd_growth += dd_growth;
 
 	err = do_budget_space(c);
 	if (likely(!err)) {
@@ -484,9 +484,9 @@ again:
 	}
 
 	/* Restore the old values */
-	c->budg_idx_growth -= idx_growth;
-	c->budg_data_growth -= data_growth;
-	c->budg_dd_growth -= dd_growth;
+	c->bi.idx_growth -= idx_growth;
+	c->bi.data_growth -= data_growth;
+	c->bi.dd_growth -= dd_growth;
 	spin_unlock(&c->space_lock);
 
 	if (req->fast) {
@@ -506,9 +506,9 @@ again:
 			goto again;
 		}
 		dbg_budg("FS is full, -ENOSPC");
-		c->nospace = 1;
+		c->bi.nospace = 1;
 		if (can_use_rp(c) || c->rp_size == 0)
-			c->nospace_rp = 1;
+			c->bi.nospace_rp = 1;
 		smp_wmb();
 	} else
 		ubifs_err("cannot budget space, error %d", err);
@@ -523,8 +523,8 @@ again:
  * This function releases the space budgeted by 'ubifs_budget_space()'. Note,
  * since the index changes (which were budgeted for in @req->idx_growth) will
  * only be written to the media on commit, this function moves the index budget
- * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be
- * zeroed by the commit operation.
+ * from @c->bi.idx_growth to @c->bi.uncommitted_idx. The latter will be zeroed
+ * by the commit operation.
  */
 void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
 {
@@ -553,23 +553,23 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
 	if (!req->data_growth && !req->dd_growth)
 		return;
 
-	c->nospace = c->nospace_rp = 0;
+	c->bi.nospace = c->bi.nospace_rp = 0;
 	smp_wmb();
 
 	spin_lock(&c->space_lock);
-	c->budg_idx_growth -= req->idx_growth;
-	c->budg_uncommitted_idx += req->idx_growth;
-	c->budg_data_growth -= req->data_growth;
-	c->budg_dd_growth -= req->dd_growth;
-	c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+	c->bi.idx_growth -= req->idx_growth;
+	c->bi.uncommitted_idx += req->idx_growth;
+	c->bi.data_growth -= req->data_growth;
+	c->bi.dd_growth -= req->dd_growth;
+	c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
 
-	ubifs_assert(c->budg_idx_growth >= 0);
-	ubifs_assert(c->budg_data_growth >= 0);
-	ubifs_assert(c->budg_dd_growth >= 0);
-	ubifs_assert(c->min_idx_lebs < c->main_lebs);
-	ubifs_assert(!(c->budg_idx_growth & 7));
-	ubifs_assert(!(c->budg_data_growth & 7));
-	ubifs_assert(!(c->budg_dd_growth & 7));
+	ubifs_assert(c->bi.idx_growth >= 0);
+	ubifs_assert(c->bi.data_growth >= 0);
+	ubifs_assert(c->bi.dd_growth >= 0);
+	ubifs_assert(c->bi.min_idx_lebs < c->main_lebs);
+	ubifs_assert(!(c->bi.idx_growth & 7));
+	ubifs_assert(!(c->bi.data_growth & 7));
+	ubifs_assert(!(c->bi.dd_growth & 7));
 	spin_unlock(&c->space_lock);
 }
 
@@ -586,13 +586,13 @@ void ubifs_convert_page_budget(struct ubifs_info *c)
 {
 	spin_lock(&c->space_lock);
 	/* Release the index growth reservation */
-	c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+	c->bi.idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
 	/* Release the data growth reservation */
-	c->budg_data_growth -= c->page_budget;
+	c->bi.data_growth -= c->bi.page_budget;
 	/* Increase the dirty data growth reservation instead */
-	c->budg_dd_growth += c->page_budget;
+	c->bi.dd_growth += c->bi.page_budget;
 	/* And re-calculate the indexing space reservation */
-	c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+	c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
 	spin_unlock(&c->space_lock);
 }
 
@@ -612,7 +612,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
 
 	memset(&req, 0, sizeof(struct ubifs_budget_req));
 	/* The "no space" flags will be cleared because dd_growth is > 0 */
-	req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8);
+	req.dd_growth = c->bi.inode_budget + ALIGN(ui->data_len, 8);
 	ubifs_release_budget(c, &req);
 }
 
@@ -682,9 +682,9 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
 	int rsvd_idx_lebs, lebs;
 	long long available, outstanding, free;
 
-	ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
-	outstanding = c->budg_data_growth + c->budg_dd_growth;
-	available = ubifs_calc_available(c, c->min_idx_lebs);
+	ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c));
+	outstanding = c->bi.data_growth + c->bi.dd_growth;
+	available = ubifs_calc_available(c, c->bi.min_idx_lebs);
 
 	/*
 	 * When reporting free space to user-space, UBIFS guarantees that it is
@@ -697,8 +697,8 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
 	 * Note, the calculations below are similar to what we have in
 	 * 'do_budget_space()', so refer there for comments.
 	 */
-	if (c->min_idx_lebs > c->lst.idx_lebs)
-		rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
+	if (c->bi.min_idx_lebs > c->lst.idx_lebs)
+		rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
 	else
 		rsvd_idx_lebs = 0;
 	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
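
The mechanical budg_* to bi.* renames throughout budget.c indicate the budgeting state now lives in one embedded structure so it can be snapshotted wholesale (the debug.c hunks below memcpy c->bi into saved state). The apparent shape, reconstructed from the field accesses in this diff rather than copied from ubifs.h, so treat it as a sketch:

/* reconstructed sketch of the new container, not authoritative */
struct ubifs_budg_info {
	long long idx_growth;
	long long data_growth;
	long long dd_growth;
	long long uncommitted_idx;
	unsigned long long old_idx_sz;
	int min_idx_lebs;
	unsigned int nospace:1;
	unsigned int nospace_rp:1;
	int page_budget;
	int inode_budget;
	int dent_budget;
};
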
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 1bd01ded7123..87cd0ead8633 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -182,7 +182,7 @@ static int do_commit(struct ubifs_info *c)
 	c->mst_node->root_len = cpu_to_le32(zroot.len);
 	c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum);
 	c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs);
-	c->mst_node->index_size = cpu_to_le64(c->old_idx_sz);
+	c->mst_node->index_size = cpu_to_le64(c->bi.old_idx_sz);
 	c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum);
 	c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs);
 	c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 004d3745dc45..0bb2bcef0de9 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,7 +34,6 @@
 #include <linux/moduleparam.h>
 #include <linux/debugfs.h>
 #include <linux/math64.h>
-#include <linux/slab.h>
 
 #ifdef CONFIG_UBIFS_FS_DEBUG
 
@@ -43,15 +42,12 @@ DEFINE_SPINLOCK(dbg_lock);
 static char dbg_key_buf0[128];
 static char dbg_key_buf1[128];
 
-unsigned int ubifs_msg_flags;
 unsigned int ubifs_chk_flags;
 unsigned int ubifs_tst_flags;
 
-module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR);
 module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR);
 module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR);
 
-MODULE_PARM_DESC(debug_msgs, "Debug message type flags");
 MODULE_PARM_DESC(debug_chks, "Debug check flags");
 MODULE_PARM_DESC(debug_tsts, "Debug special test flags");
 
@@ -317,6 +313,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
 		printk(KERN_DEBUG "\tflags %#x\n", sup_flags);
 		printk(KERN_DEBUG "\t big_lpt %u\n",
 		       !!(sup_flags & UBIFS_FLG_BIGLPT));
+		printk(KERN_DEBUG "\t space_fixup %u\n",
+		       !!(sup_flags & UBIFS_FLG_SPACE_FIXUP));
 		printk(KERN_DEBUG "\tmin_io_size %u\n",
 		       le32_to_cpu(sup->min_io_size));
 		printk(KERN_DEBUG "\tleb_size %u\n",
@@ -602,7 +600,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
 	spin_unlock(&dbg_lock);
 }
 
-void dbg_dump_budg(struct ubifs_info *c)
+void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
 {
 	int i;
 	struct rb_node *rb;
@@ -610,26 +608,42 @@ void dbg_dump_budg(struct ubifs_info *c)
 	struct ubifs_gced_idx_leb *idx_gc;
 	long long available, outstanding, free;
 
-	ubifs_assert(spin_is_locked(&c->space_lock));
+	spin_lock(&c->space_lock);
 	spin_lock(&dbg_lock);
-	printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, "
-	       "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,
-	       c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth);
-	printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, "
-	       "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth,
-	       c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth,
-	       c->freeable_cnt);
-	printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, "
-	       "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs,
-	       c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt);
+	printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, "
+	       "total budget sum %lld\n", current->pid,
+	       bi->data_growth + bi->dd_growth,
+	       bi->data_growth + bi->dd_growth + bi->idx_growth);
+	printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, "
+	       "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth,
+	       bi->idx_growth);
+	printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, "
+	       "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz,
+	       bi->uncommitted_idx);
+	printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n",
+	       bi->page_budget, bi->inode_budget, bi->dent_budget);
+	printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n",
+	       bi->nospace, bi->nospace_rp);
+	printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
+	       c->dark_wm, c->dead_wm, c->max_idx_node_sz);
+
+	if (bi != &c->bi)
+		/*
+		 * If we are dumping saved budgeting data, do not print
+		 * additional information which is about the current state, not
+		 * the old one which corresponded to the saved budgeting data.
+		 */
+		goto out_unlock;
+
+	printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n",
+	       c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt);
 	printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "
 	       "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt),
 	       atomic_long_read(&c->dirty_zn_cnt),
 	       atomic_long_read(&c->clean_zn_cnt));
-	printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
-	       c->dark_wm, c->dead_wm, c->max_idx_node_sz);
 	printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
 	       c->gc_lnum, c->ihead_lnum);
+
 	/* If we are in R/O mode, journal heads do not exist */
 	if (c->jheads)
 		for (i = 0; i < c->jhead_cnt; i++)
@@ -648,13 +662,15 @@ void dbg_dump_budg(struct ubifs_info *c)
 	printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
 
 	/* Print budgeting predictions */
-	available = ubifs_calc_available(c, c->min_idx_lebs);
-	outstanding = c->budg_data_growth + c->budg_dd_growth;
+	available = ubifs_calc_available(c, c->bi.min_idx_lebs);
+	outstanding = c->bi.data_growth + c->bi.dd_growth;
 	free = ubifs_get_free_space_nolock(c);
 	printk(KERN_DEBUG "Budgeting predictions:\n");
 	printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
 	       available, outstanding, free);
+out_unlock:
 	spin_unlock(&dbg_lock);
+	spin_unlock(&c->space_lock);
 }
 
 void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
@@ -729,7 +745,13 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
 		if (bud->lnum == lp->lnum) {
 			int head = 0;
 			for (i = 0; i < c->jhead_cnt; i++) {
-				if (lp->lnum == c->jheads[i].wbuf.lnum) {
+				/*
+				 * Note, if we are in R/O mode or in the middle
+				 * of mounting/re-mounting, the write-buffers do
+				 * not exist.
+				 */
+				if (c->jheads &&
+				    lp->lnum == c->jheads[i].wbuf.lnum) {
 					printk(KERN_CONT ", jhead %s",
 					       dbg_jhead(i));
 					head = 1;
@@ -976,6 +998,8 @@ void dbg_save_space_info(struct ubifs_info *c)
 
 	spin_lock(&c->space_lock);
 	memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats));
+	memcpy(&d->saved_bi, &c->bi, sizeof(struct ubifs_budg_info));
+	d->saved_idx_gc_cnt = c->idx_gc_cnt;
 
 	/*
 	 * We use a dirty hack here and zero out @c->freeable_cnt, because it
@@ -1042,14 +1066,14 @@ int dbg_check_space_info(struct ubifs_info *c)
 out:
 	ubifs_msg("saved lprops statistics dump");
 	dbg_dump_lstats(&d->saved_lst);
-	ubifs_get_lp_stats(c, &lst);
-
+	ubifs_msg("saved budgeting info dump");
+	dbg_dump_budg(c, &d->saved_bi);
+	ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt);
 	ubifs_msg("current lprops statistics dump");
+	ubifs_get_lp_stats(c, &lst);
 	dbg_dump_lstats(&lst);
-
-	spin_lock(&c->space_lock);
-	dbg_dump_budg(c);
-	spin_unlock(&c->space_lock);
+	ubifs_msg("current budgeting info dump");
+	dbg_dump_budg(c, &c->bi);
 	dump_stack();
 	return -EINVAL;
 }
@@ -1793,6 +1817,8 @@ static struct fsck_inode *add_inode(struct ubifs_info *c,
 	struct rb_node **p, *parent = NULL;
 	struct fsck_inode *fscki;
 	ino_t inum = key_inum_flash(c, &ino->key);
+	struct inode *inode;
+	struct ubifs_inode *ui;
 
 	p = &fsckd->inodes.rb_node;
 	while (*p) {
@@ -1816,19 +1842,46 @@ static struct fsck_inode *add_inode(struct ubifs_info *c,
1816 if (!fscki) 1842 if (!fscki)
1817 return ERR_PTR(-ENOMEM); 1843 return ERR_PTR(-ENOMEM);
1818 1844
1845 inode = ilookup(c->vfs_sb, inum);
1846
1819 fscki->inum = inum; 1847 fscki->inum = inum;
1820 fscki->nlink = le32_to_cpu(ino->nlink); 1848 /*
1821 fscki->size = le64_to_cpu(ino->size); 1849 * If the inode is present in the VFS inode cache, use it instead of
1822 fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); 1850 * the on-flash inode which might be out-of-date. E.g., the size might
1823 fscki->xattr_sz = le32_to_cpu(ino->xattr_size); 1851 * be out-of-date. If we do not do this, the following may happen, for
1824 fscki->xattr_nms = le32_to_cpu(ino->xattr_names); 1852 * example:
1825 fscki->mode = le32_to_cpu(ino->mode); 1853 * 1. A power cut happens
 1854 * 2. We mount the file-system R/O; the replay process fixes up the
 1855 * inode size in the VFS cache, but not on-flash.
1856 * 3. 'check_leaf()' fails because it hits a data node beyond inode
1857 * size.
1858 */
1859 if (!inode) {
1860 fscki->nlink = le32_to_cpu(ino->nlink);
1861 fscki->size = le64_to_cpu(ino->size);
1862 fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
1863 fscki->xattr_sz = le32_to_cpu(ino->xattr_size);
1864 fscki->xattr_nms = le32_to_cpu(ino->xattr_names);
1865 fscki->mode = le32_to_cpu(ino->mode);
1866 } else {
1867 ui = ubifs_inode(inode);
1868 fscki->nlink = inode->i_nlink;
1869 fscki->size = inode->i_size;
1870 fscki->xattr_cnt = ui->xattr_cnt;
1871 fscki->xattr_sz = ui->xattr_size;
1872 fscki->xattr_nms = ui->xattr_names;
1873 fscki->mode = inode->i_mode;
1874 iput(inode);
1875 }
1876
1826 if (S_ISDIR(fscki->mode)) { 1877 if (S_ISDIR(fscki->mode)) {
1827 fscki->calc_sz = UBIFS_INO_NODE_SZ; 1878 fscki->calc_sz = UBIFS_INO_NODE_SZ;
1828 fscki->calc_cnt = 2; 1879 fscki->calc_cnt = 2;
1829 } 1880 }
1881
1830 rb_link_node(&fscki->rb, parent, p); 1882 rb_link_node(&fscki->rb, parent, p);
1831 rb_insert_color(&fscki->rb, &fsckd->inodes); 1883 rb_insert_color(&fscki->rb, &fsckd->inodes);
1884
1832 return fscki; 1885 return fscki;
1833} 1886}
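The add_inode() change above encodes a general rule: when a live in-memory copy of an object exists, trust it over the possibly stale on-media record, and drop the reference (iput()) when done. A userspace model of that preference, with invented names and a one-entry stand-in for the VFS inode cache:

    #include <stdio.h>
    #include <stddef.h>

    struct inode_rec {
            long ino;
            long long size;
    };

    /* Stand-in for the VFS inode cache lookup (cf. ilookup()). */
    static struct inode_rec *cache_lookup(long ino)
    {
            static struct inode_rec cached = { 7, 4096 }; /* replay fixed this up */

            return ino == cached.ino ? &cached : NULL;
    }

    /* Prefer the live copy; fall back to the on-media record. */
    static long long effective_size(long ino, const struct inode_rec *on_media)
    {
            const struct inode_rec *live = cache_lookup(ino);

            return live ? live->size : on_media->size;
    }

    int main(void)
    {
            struct inode_rec flash = { 7, 1024 }; /* stale size from before replay */

            printf("size used by the checker: %lld\n", effective_size(7, &flash));
            return 0;
    }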
1834 1887
@@ -2421,7 +2474,8 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
2421 hashb = key_block(c, &sb->key); 2474 hashb = key_block(c, &sb->key);
2422 2475
2423 if (hasha > hashb) { 2476 if (hasha > hashb) {
2424 ubifs_err("larger hash %u goes before %u", hasha, hashb); 2477 ubifs_err("larger hash %u goes before %u",
2478 hasha, hashb);
2425 goto error_dump; 2479 goto error_dump;
2426 } 2480 }
2427 } 2481 }
@@ -2437,14 +2491,12 @@ error_dump:
2437 return 0; 2491 return 0;
2438} 2492}
2439 2493
2440static int invocation_cnt;
2441
2442int dbg_force_in_the_gaps(void) 2494int dbg_force_in_the_gaps(void)
2443{ 2495{
2444 if (!dbg_force_in_the_gaps_enabled) 2496 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
2445 return 0; 2497 return 0;
2446 /* Force in-the-gaps every 8th commit */ 2498
2447 return !((invocation_cnt++) & 0x7); 2499 return !(random32() & 7);
2448} 2500}
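The dbg_force_in_the_gaps() change above swaps a deterministic "every 8th call" counter for a stateless 1-in-8 random trigger, which removes the static variable and avoids exercising the same fixed pattern on every run. Both variants side by side, as a plain C sketch with rand() standing in for the kernel's random32():

    #include <stdlib.h>

    /* Old style: fires on exactly every 8th invocation, needs static state. */
    static int force_every_8th(void)
    {
            static int invocation_cnt;

            return !(invocation_cnt++ & 0x7);
    }

    /* New style: fires with probability 1/8, stateless and pattern-free. */
    static int force_randomly(void)
    {
            return !(rand() & 7);
    }

    int main(void)
    {
            int det = 0, rnd = 0, i;

            for (i = 0; i < 8000; i++) {
                    det += force_every_8th();
                    rnd += force_randomly();
            }
            /* det is exactly 1000; rnd is roughly 1000 on average */
            return !(det == 1000);
    }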
2449 2501
2450/* Failure mode for recovery testing */ 2502/* Failure mode for recovery testing */
@@ -2632,7 +2684,7 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
2632 int len, int check) 2684 int len, int check)
2633{ 2685{
2634 if (in_failure_mode(desc)) 2686 if (in_failure_mode(desc))
2635 return -EIO; 2687 return -EROFS;
2636 return ubi_leb_read(desc, lnum, buf, offset, len, check); 2688 return ubi_leb_read(desc, lnum, buf, offset, len, check);
2637} 2689}
2638 2690
@@ -2642,7 +2694,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
2642 int err, failing; 2694 int err, failing;
2643 2695
2644 if (in_failure_mode(desc)) 2696 if (in_failure_mode(desc))
2645 return -EIO; 2697 return -EROFS;
2646 failing = do_fail(desc, lnum, 1); 2698 failing = do_fail(desc, lnum, 1);
2647 if (failing) 2699 if (failing)
2648 cut_data(buf, len); 2700 cut_data(buf, len);
@@ -2650,7 +2702,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
2650 if (err) 2702 if (err)
2651 return err; 2703 return err;
2652 if (failing) 2704 if (failing)
2653 return -EIO; 2705 return -EROFS;
2654 return 0; 2706 return 0;
2655} 2707}
2656 2708
@@ -2660,12 +2712,12 @@ int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
2660 int err; 2712 int err;
2661 2713
2662 if (do_fail(desc, lnum, 1)) 2714 if (do_fail(desc, lnum, 1))
2663 return -EIO; 2715 return -EROFS;
2664 err = ubi_leb_change(desc, lnum, buf, len, dtype); 2716 err = ubi_leb_change(desc, lnum, buf, len, dtype);
2665 if (err) 2717 if (err)
2666 return err; 2718 return err;
2667 if (do_fail(desc, lnum, 1)) 2719 if (do_fail(desc, lnum, 1))
2668 return -EIO; 2720 return -EROFS;
2669 return 0; 2721 return 0;
2670} 2722}
2671 2723
@@ -2674,12 +2726,12 @@ int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum)
2674 int err; 2726 int err;
2675 2727
2676 if (do_fail(desc, lnum, 0)) 2728 if (do_fail(desc, lnum, 0))
2677 return -EIO; 2729 return -EROFS;
2678 err = ubi_leb_erase(desc, lnum); 2730 err = ubi_leb_erase(desc, lnum);
2679 if (err) 2731 if (err)
2680 return err; 2732 return err;
2681 if (do_fail(desc, lnum, 0)) 2733 if (do_fail(desc, lnum, 0))
2682 return -EIO; 2734 return -EROFS;
2683 return 0; 2735 return 0;
2684} 2736}
2685 2737
@@ -2688,19 +2740,19 @@ int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum)
2688 int err; 2740 int err;
2689 2741
2690 if (do_fail(desc, lnum, 0)) 2742 if (do_fail(desc, lnum, 0))
2691 return -EIO; 2743 return -EROFS;
2692 err = ubi_leb_unmap(desc, lnum); 2744 err = ubi_leb_unmap(desc, lnum);
2693 if (err) 2745 if (err)
2694 return err; 2746 return err;
2695 if (do_fail(desc, lnum, 0)) 2747 if (do_fail(desc, lnum, 0))
2696 return -EIO; 2748 return -EROFS;
2697 return 0; 2749 return 0;
2698} 2750}
2699 2751
2700int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum) 2752int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum)
2701{ 2753{
2702 if (in_failure_mode(desc)) 2754 if (in_failure_mode(desc))
2703 return -EIO; 2755 return -EROFS;
2704 return ubi_is_mapped(desc, lnum); 2756 return ubi_is_mapped(desc, lnum);
2705} 2757}
2706 2758
@@ -2709,12 +2761,12 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
2709 int err; 2761 int err;
2710 2762
2711 if (do_fail(desc, lnum, 0)) 2763 if (do_fail(desc, lnum, 0))
2712 return -EIO; 2764 return -EROFS;
2713 err = ubi_leb_map(desc, lnum, dtype); 2765 err = ubi_leb_map(desc, lnum, dtype);
2714 if (err) 2766 if (err)
2715 return err; 2767 return err;
2716 if (do_fail(desc, lnum, 0)) 2768 if (do_fail(desc, lnum, 0))
2717 return -EIO; 2769 return -EROFS;
2718 return 0; 2770 return 0;
2719} 2771}
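All the dbg_leb_* wrappers above share one shape: probe the emulated-power-cut state before and after delegating to the real UBI call, and report -EROFS (the medium went read-only) rather than a generic -EIO, matching how the rest of UBIFS treats a dead medium. A generic userspace sketch of the wrapper shape; EROFS comes from <errno.h>, everything else is invented:

    #include <errno.h>
    #include <stdio.h>

    static int failure_mode;        /* set by the test harness */

    static int real_erase(int lnum)
    {
            printf("erase LEB %d\n", lnum);
            return 0;
    }

    /* Wrapper: inject -EROFS before and after the real operation. */
    static int dbg_erase(int lnum)
    {
            int err;

            if (failure_mode)
                    return -EROFS;
            err = real_erase(lnum);
            if (err)
                    return err;
            if (failure_mode)
                    return -EROFS;
            return 0;
    }

    int main(void)
    {
            failure_mode = 1;
            return dbg_erase(3) == -EROFS ? 0 : 1;
    }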
2720 2772
@@ -2784,7 +2836,7 @@ void dbg_debugfs_exit(void)
2784static int open_debugfs_file(struct inode *inode, struct file *file) 2836static int open_debugfs_file(struct inode *inode, struct file *file)
2785{ 2837{
2786 file->private_data = inode->i_private; 2838 file->private_data = inode->i_private;
2787 return 0; 2839 return nonseekable_open(inode, file);
2788} 2840}
2789 2841
2790static ssize_t write_debugfs_file(struct file *file, const char __user *buf, 2842static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
@@ -2795,18 +2847,15 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
2795 2847
2796 if (file->f_path.dentry == d->dfs_dump_lprops) 2848 if (file->f_path.dentry == d->dfs_dump_lprops)
2797 dbg_dump_lprops(c); 2849 dbg_dump_lprops(c);
2798 else if (file->f_path.dentry == d->dfs_dump_budg) { 2850 else if (file->f_path.dentry == d->dfs_dump_budg)
2799 spin_lock(&c->space_lock); 2851 dbg_dump_budg(c, &c->bi);
2800 dbg_dump_budg(c); 2852 else if (file->f_path.dentry == d->dfs_dump_tnc) {
2801 spin_unlock(&c->space_lock);
2802 } else if (file->f_path.dentry == d->dfs_dump_tnc) {
2803 mutex_lock(&c->tnc_mutex); 2853 mutex_lock(&c->tnc_mutex);
2804 dbg_dump_tnc(c); 2854 dbg_dump_tnc(c);
2805 mutex_unlock(&c->tnc_mutex); 2855 mutex_unlock(&c->tnc_mutex);
2806 } else 2856 } else
2807 return -EINVAL; 2857 return -EINVAL;
2808 2858
2809 *ppos += count;
2810 return count; 2859 return count;
2811} 2860}
2812 2861
@@ -2814,7 +2863,7 @@ static const struct file_operations dfs_fops = {
2814 .open = open_debugfs_file, 2863 .open = open_debugfs_file,
2815 .write = write_debugfs_file, 2864 .write = write_debugfs_file,
2816 .owner = THIS_MODULE, 2865 .owner = THIS_MODULE,
2817 .llseek = default_llseek, 2866 .llseek = no_llseek,
2818}; 2867};
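The two hunks above convert the debugfs knobs to the standard non-seekable idiom: nonseekable_open() in .open, no_llseek in .llseek, and no *ppos bookkeeping in .write, since a trigger file has no meaningful position. A sketch of a write-only debugfs knob in this style, as a standalone module of the same kernel era; the "example"/"trigger" names are made up and error handling is elided:

    #include <linux/module.h>
    #include <linux/debugfs.h>
    #include <linux/fs.h>

    static struct dentry *dir;

    static int knob_open(struct inode *inode, struct file *file)
    {
            file->private_data = inode->i_private;
            /* Mark the file non-seekable so f_pos is never consulted. */
            return nonseekable_open(inode, file);
    }

    static ssize_t knob_write(struct file *file, const char __user *buf,
                              size_t count, loff_t *ppos)
    {
            /* Act on the write; no *ppos bookkeeping for a pure trigger file. */
            pr_info("knob poked\n");
            return count;
    }

    static const struct file_operations knob_fops = {
            .owner  = THIS_MODULE,
            .open   = knob_open,
            .write  = knob_write,
            .llseek = no_llseek,
    };

    static int __init knob_init(void)
    {
            dir = debugfs_create_dir("example", NULL);
            debugfs_create_file("trigger", 0200, dir, NULL, &knob_fops);
            return 0;
    }

    static void __exit knob_exit(void)
    {
            debugfs_remove_recursive(dir);
    }

    module_init(knob_init);
    module_exit(knob_exit);
    MODULE_LICENSE("GPL");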
2819 2868
2820/** 2869/**
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index e6493cac193d..a811ac4a26bb 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -31,6 +31,8 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
31 31
32#ifdef CONFIG_UBIFS_FS_DEBUG 32#ifdef CONFIG_UBIFS_FS_DEBUG
33 33
34#include <linux/random.h>
35
34/** 36/**
35 * ubifs_debug_info - per-FS debugging information. 37 * ubifs_debug_info - per-FS debugging information.
36 * @old_zroot: old index root - used by 'dbg_check_old_index()' 38 * @old_zroot: old index root - used by 'dbg_check_old_index()'
@@ -50,13 +52,15 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
50 * @new_ihead_offs: used by debugging to check @c->ihead_offs 52 * @new_ihead_offs: used by debugging to check @c->ihead_offs
51 * 53 *
52 * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()') 54 * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()')
53 * @saved_free: saved free space (used by 'dbg_save_space_info()') 55 * @saved_bi: saved budgeting information
56 * @saved_free: saved amount of free space
57 * @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt
54 * 58 *
55 * dfs_dir_name: name of debugfs directory containing this file-system's files 59 * @dfs_dir_name: name of debugfs directory containing this file-system's files
56 * dfs_dir: direntry object of the file-system debugfs directory 60 * @dfs_dir: direntry object of the file-system debugfs directory
57 * dfs_dump_lprops: "dump lprops" debugfs knob 61 * @dfs_dump_lprops: "dump lprops" debugfs knob
58 * dfs_dump_budg: "dump budgeting information" debugfs knob 62 * @dfs_dump_budg: "dump budgeting information" debugfs knob
59 * dfs_dump_tnc: "dump TNC" debugfs knob 63 * @dfs_dump_tnc: "dump TNC" debugfs knob
60 */ 64 */
61struct ubifs_debug_info { 65struct ubifs_debug_info {
62 struct ubifs_zbranch old_zroot; 66 struct ubifs_zbranch old_zroot;
@@ -76,7 +80,9 @@ struct ubifs_debug_info {
76 int new_ihead_offs; 80 int new_ihead_offs;
77 81
78 struct ubifs_lp_stats saved_lst; 82 struct ubifs_lp_stats saved_lst;
83 struct ubifs_budg_info saved_bi;
79 long long saved_free; 84 long long saved_free;
85 int saved_idx_gc_cnt;
80 86
81 char dfs_dir_name[100]; 87 char dfs_dir_name[100];
82 struct dentry *dfs_dir; 88 struct dentry *dfs_dir;
@@ -101,23 +107,7 @@ struct ubifs_debug_info {
101 } \ 107 } \
102} while (0) 108} while (0)
103 109
104#define dbg_dump_stack() do { \ 110#define dbg_dump_stack() dump_stack()
105 if (!dbg_failure_mode) \
106 dump_stack(); \
107} while (0)
108
109/* Generic debugging messages */
110#define dbg_msg(fmt, ...) do { \
111 spin_lock(&dbg_lock); \
112 printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \
113 __func__, ##__VA_ARGS__); \
114 spin_unlock(&dbg_lock); \
115} while (0)
116
117#define dbg_do_msg(typ, fmt, ...) do { \
118 if (ubifs_msg_flags & typ) \
119 dbg_msg(fmt, ##__VA_ARGS__); \
120} while (0)
121 111
122#define dbg_err(fmt, ...) do { \ 112#define dbg_err(fmt, ...) do { \
123 spin_lock(&dbg_lock); \ 113 spin_lock(&dbg_lock); \
@@ -137,77 +127,40 @@ const char *dbg_key_str1(const struct ubifs_info *c,
137#define DBGKEY(key) dbg_key_str0(c, (key)) 127#define DBGKEY(key) dbg_key_str0(c, (key))
138#define DBGKEY1(key) dbg_key_str1(c, (key)) 128#define DBGKEY1(key) dbg_key_str1(c, (key))
139 129
140/* General messages */ 130#define ubifs_dbg_msg(type, fmt, ...) do { \
141#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) 131 spin_lock(&dbg_lock); \
132 pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \
133 spin_unlock(&dbg_lock); \
134} while (0)
142 135
 136/* Just debugging messages not related to any specific UBIFS subsystem */
137#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__)
138/* General messages */
139#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__)
143/* Additional journal messages */ 140/* Additional journal messages */
144#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) 141#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__)
145
146/* Additional TNC messages */ 142/* Additional TNC messages */
147#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) 143#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__)
148
149/* Additional lprops messages */ 144/* Additional lprops messages */
150#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) 145#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__)
151
152/* Additional LEB find messages */ 146/* Additional LEB find messages */
153#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) 147#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__)
154
155/* Additional mount messages */ 148/* Additional mount messages */
156#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) 149#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__)
157
158/* Additional I/O messages */ 150/* Additional I/O messages */
159#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) 151#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__)
160
161/* Additional commit messages */ 152/* Additional commit messages */
162#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) 153#define dbg_cmt(fmt, ...) ubifs_dbg_msg("cmt", fmt, ##__VA_ARGS__)
163
164/* Additional budgeting messages */ 154/* Additional budgeting messages */
165#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) 155#define dbg_budg(fmt, ...) ubifs_dbg_msg("budg", fmt, ##__VA_ARGS__)
166
167/* Additional log messages */ 156/* Additional log messages */
168#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) 157#define dbg_log(fmt, ...) ubifs_dbg_msg("log", fmt, ##__VA_ARGS__)
169
170/* Additional gc messages */ 158/* Additional gc messages */
171#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) 159#define dbg_gc(fmt, ...) ubifs_dbg_msg("gc", fmt, ##__VA_ARGS__)
172
173/* Additional scan messages */ 160/* Additional scan messages */
174#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) 161#define dbg_scan(fmt, ...) ubifs_dbg_msg("scan", fmt, ##__VA_ARGS__)
175
176/* Additional recovery messages */ 162/* Additional recovery messages */
177#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) 163#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__)
178
179/*
180 * Debugging message type flags.
181 *
182 * UBIFS_MSG_GEN: general messages
183 * UBIFS_MSG_JNL: journal messages
184 * UBIFS_MSG_MNT: mount messages
185 * UBIFS_MSG_CMT: commit messages
186 * UBIFS_MSG_FIND: LEB find messages
187 * UBIFS_MSG_BUDG: budgeting messages
188 * UBIFS_MSG_GC: garbage collection messages
189 * UBIFS_MSG_TNC: TNC messages
190 * UBIFS_MSG_LP: lprops messages
191 * UBIFS_MSG_IO: I/O messages
192 * UBIFS_MSG_LOG: log messages
193 * UBIFS_MSG_SCAN: scan messages
194 * UBIFS_MSG_RCVRY: recovery messages
195 */
196enum {
197 UBIFS_MSG_GEN = 0x1,
198 UBIFS_MSG_JNL = 0x2,
199 UBIFS_MSG_MNT = 0x4,
200 UBIFS_MSG_CMT = 0x8,
201 UBIFS_MSG_FIND = 0x10,
202 UBIFS_MSG_BUDG = 0x20,
203 UBIFS_MSG_GC = 0x40,
204 UBIFS_MSG_TNC = 0x80,
205 UBIFS_MSG_LP = 0x100,
206 UBIFS_MSG_IO = 0x200,
207 UBIFS_MSG_LOG = 0x400,
208 UBIFS_MSG_SCAN = 0x800,
209 UBIFS_MSG_RCVRY = 0x1000,
210};
211 164
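The debug.h rework above collapses the dbg_do_msg()/UBIFS_MSG_* bit-mask machinery into a single ubifs_dbg_msg() macro built on pr_debug(): enabling or disabling individual call sites becomes the job of the kernel's dynamic debug facility instead of module-private flags, and the subsystem name survives only as a string tag. A userspace model of the macro family, with printf() standing in for pr_debug() and the locking dropped:

    #include <stdio.h>

    /* One common macro; the subsystem tag replaces the old bit flags. */
    #define dbg_msg_typed(type, fmt, ...) \
            printf("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__)

    #define dbg_gen(fmt, ...) dbg_msg_typed("gen", fmt, ##__VA_ARGS__)
    #define dbg_jnl(fmt, ...) dbg_msg_typed("jnl", fmt, ##__VA_ARGS__)

    int main(void)
    {
            dbg_gen("mounted volume %d", 3);
            dbg_jnl("reserved %d bytes", 128);
            return 0;
    }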
212/* 165/*
213 * Debugging check flags. 166 * Debugging check flags.
@@ -233,11 +186,9 @@ enum {
233/* 186/*
234 * Special testing flags. 187 * Special testing flags.
235 * 188 *
236 * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method
237 * UBIFS_TST_RCVRY: failure mode for recovery testing 189 * UBIFS_TST_RCVRY: failure mode for recovery testing
238 */ 190 */
239enum { 191enum {
240 UBIFS_TST_FORCE_IN_THE_GAPS = 0x2,
241 UBIFS_TST_RCVRY = 0x4, 192 UBIFS_TST_RCVRY = 0x4,
242}; 193};
243 194
@@ -262,7 +213,7 @@ void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
262 int offs); 213 int offs);
263void dbg_dump_budget_req(const struct ubifs_budget_req *req); 214void dbg_dump_budget_req(const struct ubifs_budget_req *req);
264void dbg_dump_lstats(const struct ubifs_lp_stats *lst); 215void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
265void dbg_dump_budg(struct ubifs_info *c); 216void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi);
266void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); 217void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
267void dbg_dump_lprops(struct ubifs_info *c); 218void dbg_dump_lprops(struct ubifs_info *c);
268void dbg_dump_lpt_info(struct ubifs_info *c); 219void dbg_dump_lpt_info(struct ubifs_info *c);
@@ -304,18 +255,16 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
304int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head); 255int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
305 256
306/* Force the use of in-the-gaps method for testing */ 257/* Force the use of in-the-gaps method for testing */
307 258static inline int dbg_force_in_the_gaps_enabled(void)
308#define dbg_force_in_the_gaps_enabled \ 259{
309 (ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS) 260 return ubifs_chk_flags & UBIFS_CHK_GEN;
310 261}
311int dbg_force_in_the_gaps(void); 262int dbg_force_in_the_gaps(void);
312 263
313/* Failure mode for recovery testing */ 264/* Failure mode for recovery testing */
314
315#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) 265#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
316 266
317#ifndef UBIFS_DBG_PRESERVE_UBI 267#ifndef UBIFS_DBG_PRESERVE_UBI
318
319#define ubi_leb_read dbg_leb_read 268#define ubi_leb_read dbg_leb_read
320#define ubi_leb_write dbg_leb_write 269#define ubi_leb_write dbg_leb_write
321#define ubi_leb_change dbg_leb_change 270#define ubi_leb_change dbg_leb_change
@@ -323,7 +272,6 @@ int dbg_force_in_the_gaps(void);
323#define ubi_leb_unmap dbg_leb_unmap 272#define ubi_leb_unmap dbg_leb_unmap
324#define ubi_is_mapped dbg_is_mapped 273#define ubi_is_mapped dbg_is_mapped
325#define ubi_leb_map dbg_leb_map 274#define ubi_leb_map dbg_leb_map
326
327#endif 275#endif
328 276
329int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, 277int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
@@ -370,33 +318,33 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
370 __func__, __LINE__, current->pid); \ 318 __func__, __LINE__, current->pid); \
371} while (0) 319} while (0)
372 320
373#define dbg_err(fmt, ...) do { \ 321#define dbg_err(fmt, ...) do { \
374 if (0) \ 322 if (0) \
375 ubifs_err(fmt, ##__VA_ARGS__); \ 323 ubifs_err(fmt, ##__VA_ARGS__); \
376} while (0) 324} while (0)
377 325
378#define dbg_msg(fmt, ...) do { \ 326#define ubifs_dbg_msg(fmt, ...) do { \
379 if (0) \ 327 if (0) \
380 printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", \ 328 pr_debug(fmt "\n", ##__VA_ARGS__); \
381 current->pid, __func__, ##__VA_ARGS__); \
382} while (0) 329} while (0)
383 330
384#define dbg_dump_stack() 331#define dbg_dump_stack()
385#define ubifs_assert_cmt_locked(c) 332#define ubifs_assert_cmt_locked(c)
386 333
387#define dbg_gen(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 334#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
388#define dbg_jnl(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 335#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
389#define dbg_tnc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 336#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
390#define dbg_lp(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 337#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
391#define dbg_find(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 338#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
392#define dbg_mnt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 339#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
393#define dbg_io(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 340#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
394#define dbg_cmt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 341#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
395#define dbg_budg(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 342#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
396#define dbg_log(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 343#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
397#define dbg_gc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 344#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
398#define dbg_scan(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 345#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
399#define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 346#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
347#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
400 348
401#define DBGKEY(key) ((char *)(key)) 349#define DBGKEY(key) ((char *)(key))
402#define DBGKEY1(key) ((char *)(key)) 350#define DBGKEY1(key) ((char *)(key))
@@ -420,7 +368,9 @@ static inline void
420dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; } 368dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; }
421static inline void 369static inline void
422dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; } 370dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; }
423static inline void dbg_dump_budg(struct ubifs_info *c) { return; } 371static inline void
372dbg_dump_budg(struct ubifs_info *c,
373 const struct ubifs_budg_info *bi) { return; }
424static inline void dbg_dump_lprop(const struct ubifs_info *c, 374static inline void dbg_dump_lprop(const struct ubifs_info *c,
425 const struct ubifs_lprops *lp) { return; } 375 const struct ubifs_lprops *lp) { return; }
426static inline void dbg_dump_lprops(struct ubifs_info *c) { return; } 376static inline void dbg_dump_lprops(struct ubifs_info *c) { return; }
@@ -482,8 +432,8 @@ dbg_check_nondata_nodes_order(struct ubifs_info *c,
482 struct list_head *head) { return 0; } 432 struct list_head *head) { return 0; }
483 433
484static inline int dbg_force_in_the_gaps(void) { return 0; } 434static inline int dbg_force_in_the_gaps(void) { return 0; }
485#define dbg_force_in_the_gaps_enabled 0 435#define dbg_force_in_the_gaps_enabled() 0
486#define dbg_failure_mode 0 436#define dbg_failure_mode 0
487 437
488static inline int dbg_debugfs_init(void) { return 0; } 438static inline int dbg_debugfs_init(void) { return 0; }
489static inline void dbg_debugfs_exit(void) { return; } 439static inline void dbg_debugfs_exit(void) { return; }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 7217d67a80a6..c2b80943560d 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -603,7 +603,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
603 ubifs_release_budget(c, &req); 603 ubifs_release_budget(c, &req);
604 else { 604 else {
605 /* We've deleted something - clean the "no space" flags */ 605 /* We've deleted something - clean the "no space" flags */
606 c->nospace = c->nospace_rp = 0; 606 c->bi.nospace = c->bi.nospace_rp = 0;
607 smp_wmb(); 607 smp_wmb();
608 } 608 }
609 return 0; 609 return 0;
@@ -656,6 +656,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
656 struct ubifs_inode *dir_ui = ubifs_inode(dir); 656 struct ubifs_inode *dir_ui = ubifs_inode(dir);
657 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; 657 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
658 658
659 dentry_unhash(dentry);
660
659 /* 661 /*
660 * Budget request settings: deletion direntry, deletion inode and 662 * Budget request settings: deletion direntry, deletion inode and
661 * changing the parent inode. If budgeting fails, go ahead anyway 663 * changing the parent inode. If budgeting fails, go ahead anyway
@@ -693,7 +695,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
693 ubifs_release_budget(c, &req); 695 ubifs_release_budget(c, &req);
694 else { 696 else {
695 /* We've deleted something - clean the "no space" flags */ 697 /* We've deleted something - clean the "no space" flags */
696 c->nospace = c->nospace_rp = 0; 698 c->bi.nospace = c->bi.nospace_rp = 0;
697 smp_wmb(); 699 smp_wmb();
698 } 700 }
699 return 0; 701 return 0;
@@ -976,6 +978,9 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
976 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; 978 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
977 struct timespec time; 979 struct timespec time;
978 980
981 if (new_inode && S_ISDIR(new_inode->i_mode))
982 dentry_unhash(new_dentry);
983
979 /* 984 /*
980 * Budget request settings: deletion direntry, new direntry, removing 985 * Budget request settings: deletion direntry, new direntry, removing
981 * the old inode, and changing old and new parent directory inodes. 986 * the old inode, and changing old and new parent directory inodes.
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index b286db79c686..5e7fccfc4b29 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -212,7 +212,7 @@ static void release_new_page_budget(struct ubifs_info *c)
212 */ 212 */
213static void release_existing_page_budget(struct ubifs_info *c) 213static void release_existing_page_budget(struct ubifs_info *c)
214{ 214{
215 struct ubifs_budget_req req = { .dd_growth = c->page_budget}; 215 struct ubifs_budget_req req = { .dd_growth = c->bi.page_budget};
216 216
217 ubifs_release_budget(c, &req); 217 ubifs_release_budget(c, &req);
218} 218}
@@ -971,11 +971,11 @@ static int do_writepage(struct page *page, int len)
971 * the page locked, and it locks @ui_mutex. However, write-back does take inode 971 * the page locked, and it locks @ui_mutex. However, write-back does take inode
972 * @i_mutex, which means other VFS operations may be run on this inode at the 972 * @i_mutex, which means other VFS operations may be run on this inode at the
973 * same time. And the problematic one is truncation to smaller size, from where 973 * same time. And the problematic one is truncation to smaller size, from where
974 * we have to call 'truncate_setsize()', which first changes @inode->i_size, then 974 * we have to call 'truncate_setsize()', which first changes @inode->i_size,
975 * drops the truncated pages. And while dropping the pages, it takes the page 975 * then drops the truncated pages. And while dropping the pages, it takes the
976 * lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' with 976 * page lock. This means that 'do_truncation()' cannot call 'truncate_setsize()'
977 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This 977 * with @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'.
978 * means that @inode->i_size is changed while @ui_mutex is unlocked. 978 * This means that @inode->i_size is changed while @ui_mutex is unlocked.
979 * 979 *
980 * XXX(truncate): with the new truncate sequence this is not true anymore, 980 * XXX(truncate): with the new truncate sequence this is not true anymore,
981 * and the calls to truncate_setsize can be move around freely. They should 981 * and the calls to truncate_setsize can be move around freely. They should
@@ -1189,7 +1189,7 @@ out_budg:
1189 if (budgeted) 1189 if (budgeted)
1190 ubifs_release_budget(c, &req); 1190 ubifs_release_budget(c, &req);
1191 else { 1191 else {
1192 c->nospace = c->nospace_rp = 0; 1192 c->bi.nospace = c->bi.nospace_rp = 0;
1193 smp_wmb(); 1193 smp_wmb();
1194 } 1194 }
1195 return err; 1195 return err;
@@ -1312,7 +1312,11 @@ int ubifs_fsync(struct file *file, int datasync)
1312 1312
1313 dbg_gen("syncing inode %lu", inode->i_ino); 1313 dbg_gen("syncing inode %lu", inode->i_ino);
1314 1314
1315 if (inode->i_sb->s_flags & MS_RDONLY) 1315 if (c->ro_mount)
1316 /*
 1317 * For some really strange reason the VFS does not filter out
 1318 * 'fsync()' for R/O mounted file-systems as of 2.6.39.
1319 */
1316 return 0; 1320 return 0;
1317 1321
1318 /* 1322 /*
@@ -1432,10 +1436,11 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
1432} 1436}
1433 1437
1434/* 1438/*
1435 * mmap()d file has taken write protection fault and is being made 1439 * mmap()d file has taken write protection fault and is being made writable.
1436 * writable. UBIFS must ensure page is budgeted for. 1440 * UBIFS must ensure page is budgeted for.
1437 */ 1441 */
1438static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1442static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
1443 struct vm_fault *vmf)
1439{ 1444{
1440 struct page *page = vmf->page; 1445 struct page *page = vmf->page;
1441 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1446 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
@@ -1536,7 +1541,6 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1536{ 1541{
1537 int err; 1542 int err;
1538 1543
1539 /* 'generic_file_mmap()' takes care of NOMMU case */
1540 err = generic_file_mmap(file, vma); 1544 err = generic_file_mmap(file, vma);
1541 if (err) 1545 if (err)
1542 return err; 1546 return err;
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 1d54383d1269..2559d174e004 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -252,8 +252,8 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
 252 * But if the index takes fewer LEBs than are reserved for it, 252 * But if the index takes fewer LEBs than are reserved for it,
253 * this function must avoid picking those reserved LEBs. 253 * this function must avoid picking those reserved LEBs.
254 */ 254 */
255 if (c->min_idx_lebs >= c->lst.idx_lebs) { 255 if (c->bi.min_idx_lebs >= c->lst.idx_lebs) {
256 rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; 256 rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
257 exclude_index = 1; 257 exclude_index = 1;
258 } 258 }
259 spin_unlock(&c->space_lock); 259 spin_unlock(&c->space_lock);
@@ -276,7 +276,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
276 pick_free = 0; 276 pick_free = 0;
277 } else { 277 } else {
278 spin_lock(&c->space_lock); 278 spin_lock(&c->space_lock);
279 exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs); 279 exclude_index = (c->bi.min_idx_lebs >= c->lst.idx_lebs);
280 spin_unlock(&c->space_lock); 280 spin_unlock(&c->space_lock);
281 } 281 }
282 282
@@ -501,8 +501,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
501 501
502 /* Check if there are enough empty LEBs for commit */ 502 /* Check if there are enough empty LEBs for commit */
503 spin_lock(&c->space_lock); 503 spin_lock(&c->space_lock);
504 if (c->min_idx_lebs > c->lst.idx_lebs) 504 if (c->bi.min_idx_lebs > c->lst.idx_lebs)
505 rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; 505 rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
506 else 506 else
507 rsvd_idx_lebs = 0; 507 rsvd_idx_lebs = 0;
508 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - 508 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 151f10882820..ded29f6224c2 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -100,6 +100,10 @@ static int switch_gc_head(struct ubifs_info *c)
100 if (err) 100 if (err)
101 return err; 101 return err;
102 102
103 err = ubifs_wbuf_sync_nolock(wbuf);
104 if (err)
105 return err;
106
103 err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0); 107 err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
104 if (err) 108 if (err)
105 return err; 109 return err;
@@ -118,7 +122,7 @@ static int switch_gc_head(struct ubifs_info *c)
118 * This function compares data nodes @a and @b. Returns %1 if @a has greater 122 * This function compares data nodes @a and @b. Returns %1 if @a has greater
119 * inode or block number, and %-1 otherwise. 123 * inode or block number, and %-1 otherwise.
120 */ 124 */
121int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) 125static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
122{ 126{
123 ino_t inuma, inumb; 127 ino_t inuma, inumb;
124 struct ubifs_info *c = priv; 128 struct ubifs_info *c = priv;
@@ -161,7 +165,8 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
161 * first and sorted by length in descending order. Directory entry nodes go 165 * first and sorted by length in descending order. Directory entry nodes go
 162 * after inode nodes and are sorted in ascending hash value order. 166 * after inode nodes and are sorted in ascending hash value order.
163 */ 167 */
164int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) 168static int nondata_nodes_cmp(void *priv, struct list_head *a,
169 struct list_head *b)
165{ 170{
166 ino_t inuma, inumb; 171 ino_t inuma, inumb;
167 struct ubifs_info *c = priv; 172 struct ubifs_info *c = priv;
@@ -473,6 +478,37 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
473 ubifs_assert(c->gc_lnum != lnum); 478 ubifs_assert(c->gc_lnum != lnum);
474 ubifs_assert(wbuf->lnum != lnum); 479 ubifs_assert(wbuf->lnum != lnum);
475 480
481 if (lp->free + lp->dirty == c->leb_size) {
482 /* Special case - a free LEB */
483 dbg_gc("LEB %d is free, return it", lp->lnum);
484 ubifs_assert(!(lp->flags & LPROPS_INDEX));
485
486 if (lp->free != c->leb_size) {
487 /*
488 * Write buffers must be sync'd before unmapping
489 * freeable LEBs, because one of them may contain data
490 * which obsoletes something in 'lp->pnum'.
491 */
492 err = gc_sync_wbufs(c);
493 if (err)
494 return err;
495 err = ubifs_change_one_lp(c, lp->lnum, c->leb_size,
496 0, 0, 0, 0);
497 if (err)
498 return err;
499 }
500 err = ubifs_leb_unmap(c, lp->lnum);
501 if (err)
502 return err;
503
504 if (c->gc_lnum == -1) {
505 c->gc_lnum = lnum;
506 return LEB_RETAINED;
507 }
508
509 return LEB_FREED;
510 }
511
476 /* 512 /*
477 * We scan the entire LEB even though we only really need to scan up to 513 * We scan the entire LEB even though we only really need to scan up to
478 * (c->leb_size - lp->free). 514 * (c->leb_size - lp->free).
@@ -682,37 +718,6 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
682 "(min. space %d)", lp.lnum, lp.free, lp.dirty, 718 "(min. space %d)", lp.lnum, lp.free, lp.dirty,
683 lp.free + lp.dirty, min_space); 719 lp.free + lp.dirty, min_space);
684 720
685 if (lp.free + lp.dirty == c->leb_size) {
686 /* An empty LEB was returned */
687 dbg_gc("LEB %d is free, return it", lp.lnum);
688 /*
689 * ubifs_find_dirty_leb() doesn't return freeable index
690 * LEBs.
691 */
692 ubifs_assert(!(lp.flags & LPROPS_INDEX));
693 if (lp.free != c->leb_size) {
694 /*
695 * Write buffers must be sync'd before
696 * unmapping freeable LEBs, because one of them
697 * may contain data which obsoletes something
698 * in 'lp.pnum'.
699 */
700 ret = gc_sync_wbufs(c);
701 if (ret)
702 goto out;
703 ret = ubifs_change_one_lp(c, lp.lnum,
704 c->leb_size, 0, 0, 0,
705 0);
706 if (ret)
707 goto out;
708 }
709 ret = ubifs_leb_unmap(c, lp.lnum);
710 if (ret)
711 goto out;
712 ret = lp.lnum;
713 break;
714 }
715
716 space_before = c->leb_size - wbuf->offs - wbuf->used; 721 space_before = c->leb_size - wbuf->offs - wbuf->used;
717 if (wbuf->lnum == -1) 722 if (wbuf->lnum == -1)
718 space_before = 0; 723 space_before = 0;
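The gc.c hunks above hoist the free-LEB fast path from ubifs_garbage_collect() into ubifs_garbage_collect_leb(): a LEB with free + dirty == leb_size needs no scanning, only (when it is freeable rather than truly empty) a write-buffer sync, then an unmap, then a decision whether to retain it as the GC head LEB. A userspace model of that decision, with invented constants:

    #include <stdio.h>

    #define LEB_SIZE        1024
    #define LEB_RETAINED    1
    #define LEB_FREED       2

    static int gc_lnum = -1;        /* LEB reserved for the GC head, -1 if none */

    /* Minimal model of the hoisted free-LEB fast path. */
    static int gc_leb(int lnum, int free_bytes, int dirty_bytes)
    {
            if (free_bytes + dirty_bytes == LEB_SIZE) {
                    if (free_bytes != LEB_SIZE)
                            printf("sync wbufs, account LEB %d as fully free\n",
                                   lnum);
                    printf("unmap LEB %d\n", lnum);
                    if (gc_lnum == -1) {
                            gc_lnum = lnum;         /* keep it as the GC head */
                            return LEB_RETAINED;
                    }
                    return LEB_FREED;
            }
            printf("LEB %d needs real garbage collection\n", lnum);
            return 0;
    }

    int main(void)
    {
            gc_leb(5, LEB_SIZE, 0);                 /* empty: retained */
            gc_leb(6, 200, LEB_SIZE - 200);         /* freeable: sync and unmap */
            return 0;
    }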
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index dfd168b7807e..166951e0dcd3 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -393,7 +393,7 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
393 ubifs_assert(wbuf->size % c->min_io_size == 0); 393 ubifs_assert(wbuf->size % c->min_io_size == 0);
394 ubifs_assert(!c->ro_media && !c->ro_mount); 394 ubifs_assert(!c->ro_media && !c->ro_mount);
395 if (c->leb_size - wbuf->offs >= c->max_write_size) 395 if (c->leb_size - wbuf->offs >= c->max_write_size)
396 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); 396 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
397 397
398 if (c->ro_error) 398 if (c->ro_error)
399 return -EROFS; 399 return -EROFS;
@@ -452,8 +452,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
452 * @dtype: data type 452 * @dtype: data type
453 * 453 *
454 * This function targets the write-buffer to logical eraseblock @lnum:@offs. 454 * This function targets the write-buffer to logical eraseblock @lnum:@offs.
455 * The write-buffer is synchronized if it is not empty. Returns zero in case of 455 * The write-buffer has to be empty. Returns zero in case of success and a
456 * success and a negative error code in case of failure. 456 * negative error code in case of failure.
457 */ 457 */
458int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, 458int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
459 int dtype) 459 int dtype)
@@ -465,13 +465,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
465 ubifs_assert(offs >= 0 && offs <= c->leb_size); 465 ubifs_assert(offs >= 0 && offs <= c->leb_size);
466 ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7)); 466 ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
467 ubifs_assert(lnum != wbuf->lnum); 467 ubifs_assert(lnum != wbuf->lnum);
468 468 ubifs_assert(wbuf->used == 0);
469 if (wbuf->used > 0) {
470 int err = ubifs_wbuf_sync_nolock(wbuf);
471
472 if (err)
473 return err;
474 }
475 469
476 spin_lock(&wbuf->lock); 470 spin_lock(&wbuf->lock);
477 wbuf->lnum = lnum; 471 wbuf->lnum = lnum;
@@ -573,7 +567,7 @@ out_timers:
573int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) 567int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
574{ 568{
575 struct ubifs_info *c = wbuf->c; 569 struct ubifs_info *c = wbuf->c;
576 int err, written, n, aligned_len = ALIGN(len, 8), offs; 570 int err, written, n, aligned_len = ALIGN(len, 8);
577 571
578 dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len, 572 dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len,
579 dbg_ntype(((struct ubifs_ch *)buf)->node_type), 573 dbg_ntype(((struct ubifs_ch *)buf)->node_type),
@@ -588,7 +582,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
588 ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); 582 ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
589 ubifs_assert(!c->ro_media && !c->ro_mount); 583 ubifs_assert(!c->ro_media && !c->ro_mount);
590 if (c->leb_size - wbuf->offs >= c->max_write_size) 584 if (c->leb_size - wbuf->offs >= c->max_write_size)
591 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); 585 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
592 586
593 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { 587 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
594 err = -ENOSPC; 588 err = -ENOSPC;
@@ -636,7 +630,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
636 goto exit; 630 goto exit;
637 } 631 }
638 632
639 offs = wbuf->offs;
640 written = 0; 633 written = 0;
641 634
642 if (wbuf->used) { 635 if (wbuf->used) {
@@ -653,7 +646,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
653 if (err) 646 if (err)
654 goto out; 647 goto out;
655 648
656 offs += wbuf->size; 649 wbuf->offs += wbuf->size;
657 len -= wbuf->avail; 650 len -= wbuf->avail;
658 aligned_len -= wbuf->avail; 651 aligned_len -= wbuf->avail;
659 written += wbuf->avail; 652 written += wbuf->avail;
@@ -672,7 +665,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
672 if (err) 665 if (err)
673 goto out; 666 goto out;
674 667
675 offs += wbuf->size; 668 wbuf->offs += wbuf->size;
676 len -= wbuf->size; 669 len -= wbuf->size;
677 aligned_len -= wbuf->size; 670 aligned_len -= wbuf->size;
678 written += wbuf->size; 671 written += wbuf->size;
@@ -687,12 +680,13 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
687 n = aligned_len >> c->max_write_shift; 680 n = aligned_len >> c->max_write_shift;
688 if (n) { 681 if (n) {
689 n <<= c->max_write_shift; 682 n <<= c->max_write_shift;
690 dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs); 683 dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
691 err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n, 684 wbuf->offs);
692 wbuf->dtype); 685 err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written,
686 wbuf->offs, n, wbuf->dtype);
693 if (err) 687 if (err)
694 goto out; 688 goto out;
695 offs += n; 689 wbuf->offs += n;
696 aligned_len -= n; 690 aligned_len -= n;
697 len -= n; 691 len -= n;
698 written += n; 692 written += n;
@@ -707,7 +701,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
707 */ 701 */
708 memcpy(wbuf->buf, buf + written, len); 702 memcpy(wbuf->buf, buf + written, len);
709 703
710 wbuf->offs = offs;
711 if (c->leb_size - wbuf->offs >= c->max_write_size) 704 if (c->leb_size - wbuf->offs >= c->max_write_size)
712 wbuf->size = c->max_write_size; 705 wbuf->size = c->max_write_size;
713 else 706 else
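The io.c hunks above stop shadowing the write-buffer position in a local offs variable: ubifs_wbuf_write_nolock() now advances wbuf->offs in place after each media write, so the structure always reflects what has actually been written, and the final write-back assignment disappears. A self-contained userspace model of a write buffer that advances its own offset:

    #include <stdio.h>
    #include <string.h>

    struct wbuf {
            int offs;       /* media offset of the buffered region */
            int used;       /* bytes buffered so far */
            int avail;      /* bytes until the next flush */
            int size;       /* flush granularity */
            char buf[64];
    };

    /* Flush the buffered bytes and advance the position in place. */
    static void wbuf_flush(struct wbuf *w)
    {
            printf("media write: %d bytes at offset %d\n", w->size, w->offs);
            w->offs += w->size;     /* no local copy to forget to write back */
            w->used = 0;
            w->avail = w->size;
    }

    static void wbuf_write(struct wbuf *w, const char *data, int len)
    {
            while (len > 0) {
                    int n = len < w->avail ? len : w->avail;

                    memcpy(w->buf + w->used, data, n);
                    w->used += n;
                    w->avail -= n;
                    data += n;
                    len -= n;
                    if (!w->avail)
                            wbuf_flush(w);
            }
    }

    int main(void)
    {
            struct wbuf w = { .offs = 0, .used = 0, .avail = 16, .size = 16 };
            const char payload[] = "0123456789012345678901234567890123456789";

            wbuf_write(&w, payload, (int)sizeof(payload) - 1);
            printf("final: offs %d, used %d\n", w.offs, w.used);
            return 0;
    }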
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index aed25e864227..34b1679e6e3a 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -141,14 +141,8 @@ again:
141 * LEB with some empty space. 141 * LEB with some empty space.
142 */ 142 */
143 lnum = ubifs_find_free_space(c, len, &offs, squeeze); 143 lnum = ubifs_find_free_space(c, len, &offs, squeeze);
144 if (lnum >= 0) { 144 if (lnum >= 0)
145 /* Found an LEB, add it to the journal head */
146 err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
147 if (err)
148 goto out_return;
149 /* A new bud was successfully allocated and added to the log */
150 goto out; 145 goto out;
151 }
152 146
153 err = lnum; 147 err = lnum;
154 if (err != -ENOSPC) 148 if (err != -ENOSPC)
@@ -203,12 +197,23 @@ again:
203 return 0; 197 return 0;
204 } 198 }
205 199
206 err = ubifs_add_bud_to_log(c, jhead, lnum, 0);
207 if (err)
208 goto out_return;
209 offs = 0; 200 offs = 0;
210 201
211out: 202out:
203 /*
204 * Make sure we synchronize the write-buffer before we add the new bud
205 * to the log. Otherwise we may have a power cut after the log
206 * reference node for the last bud (@lnum) is written but before the
207 * write-buffer data are written to the next-to-last bud
208 * (@wbuf->lnum). And the effect would be that the recovery would see
209 * that there is corruption in the next-to-last bud.
210 */
211 err = ubifs_wbuf_sync_nolock(wbuf);
212 if (err)
213 goto out_return;
214 err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
215 if (err)
216 goto out_return;
212 err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype); 217 err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype);
213 if (err) 218 if (err)
214 goto out_unlock; 219 goto out_unlock;
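The journal.c rework above is purely about ordering: the write-buffer holding the tail of the previous bud must be synchronized before the log reference node for the new bud is written, otherwise a power cut between the two steps would make recovery see corruption in the next-to-last bud. A trivial sketch that exists only to pin down the sequence; the function bodies are placeholders:

    #include <stdio.h>

    /* Stand-ins for the real operations; only the order matters here. */
    static int sync_wbuf(void)
    {
            puts("1. flush the previous bud's buffered data");
            return 0;
    }

    static int add_bud_ref(void)
    {
            puts("2. write the log reference node for the new bud");
            return 0;
    }

    static int seek_wbuf(void)
    {
            puts("3. retarget the write-buffer to the new bud");
            return 0;
    }

    /*
     * If a power cut hits between steps 1 and 2, the old bud is complete and
     * the new bud is simply absent from the log - both states recover cleanly.
     * The pre-patch order (2 before 1) could leave a log entry pointing past
     * data that never reached the previous bud.
     */
    int main(void)
    {
            return sync_wbuf() || add_bud_ref() || seek_wbuf();
    }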
@@ -380,10 +385,8 @@ out:
380 if (err == -ENOSPC) { 385 if (err == -ENOSPC) {
 381 /* These are budgeting problems, so print useful information */ 386 /* These are budgeting problems, so print useful information */
382 down_write(&c->commit_sem); 387 down_write(&c->commit_sem);
383 spin_lock(&c->space_lock);
384 dbg_dump_stack(); 388 dbg_dump_stack();
385 dbg_dump_budg(c); 389 dbg_dump_budg(c, &c->bi);
386 spin_unlock(&c->space_lock);
387 dbg_dump_lprops(c); 390 dbg_dump_lprops(c);
388 cmt_retries = dbg_check_lprops(c); 391 cmt_retries = dbg_check_lprops(c);
389 up_write(&c->commit_sem); 392 up_write(&c->commit_sem);
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 40fa780ebea7..affea9494ae2 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -100,20 +100,6 @@ struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum)
100} 100}
101 101
102/** 102/**
103 * next_log_lnum - switch to the next log LEB.
104 * @c: UBIFS file-system description object
105 * @lnum: current log LEB
106 */
107static inline int next_log_lnum(const struct ubifs_info *c, int lnum)
108{
109 lnum += 1;
110 if (lnum > c->log_last)
111 lnum = UBIFS_LOG_LNUM;
112
113 return lnum;
114}
115
116/**
117 * empty_log_bytes - calculate amount of empty space in the log. 103 * empty_log_bytes - calculate amount of empty space in the log.
118 * @c: UBIFS file-system description object 104 * @c: UBIFS file-system description object
119 */ 105 */
@@ -257,7 +243,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
257 ref->jhead = cpu_to_le32(jhead); 243 ref->jhead = cpu_to_le32(jhead);
258 244
259 if (c->lhead_offs > c->leb_size - c->ref_node_alsz) { 245 if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
260 c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); 246 c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
261 c->lhead_offs = 0; 247 c->lhead_offs = 0;
262 } 248 }
263 249
@@ -425,7 +411,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
425 411
426 /* Switch to the next log LEB */ 412 /* Switch to the next log LEB */
427 if (c->lhead_offs) { 413 if (c->lhead_offs) {
428 c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); 414 c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
429 c->lhead_offs = 0; 415 c->lhead_offs = 0;
430 } 416 }
431 417
@@ -446,7 +432,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
446 432
447 c->lhead_offs += len; 433 c->lhead_offs += len;
448 if (c->lhead_offs == c->leb_size) { 434 if (c->lhead_offs == c->leb_size) {
449 c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); 435 c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
450 c->lhead_offs = 0; 436 c->lhead_offs = 0;
451 } 437 }
452 438
@@ -533,7 +519,7 @@ int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum)
533 } 519 }
534 mutex_lock(&c->log_mutex); 520 mutex_lock(&c->log_mutex);
535 for (lnum = old_ltail_lnum; lnum != c->ltail_lnum; 521 for (lnum = old_ltail_lnum; lnum != c->ltail_lnum;
536 lnum = next_log_lnum(c, lnum)) { 522 lnum = ubifs_next_log_lnum(c, lnum)) {
537 dbg_log("unmap log LEB %d", lnum); 523 dbg_log("unmap log LEB %d", lnum);
538 err = ubifs_leb_unmap(c, lnum); 524 err = ubifs_leb_unmap(c, lnum);
539 if (err) 525 if (err)
@@ -642,7 +628,7 @@ static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs,
642 err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM); 628 err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM);
643 if (err) 629 if (err)
644 return err; 630 return err;
645 *lnum = next_log_lnum(c, *lnum); 631 *lnum = ubifs_next_log_lnum(c, *lnum);
646 *offs = 0; 632 *offs = 0;
647 } 633 }
648 memcpy(buf + *offs, node, len); 634 memcpy(buf + *offs, node, len);
@@ -712,7 +698,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
712 ubifs_scan_destroy(sleb); 698 ubifs_scan_destroy(sleb);
713 if (lnum == c->lhead_lnum) 699 if (lnum == c->lhead_lnum)
714 break; 700 break;
715 lnum = next_log_lnum(c, lnum); 701 lnum = ubifs_next_log_lnum(c, lnum);
716 } 702 }
717 if (offs) { 703 if (offs) {
718 int sz = ALIGN(offs, c->min_io_size); 704 int sz = ALIGN(offs, c->min_io_size);
@@ -732,7 +718,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
732 /* Unmap remaining LEBs */ 718 /* Unmap remaining LEBs */
733 lnum = write_lnum; 719 lnum = write_lnum;
734 do { 720 do {
735 lnum = next_log_lnum(c, lnum); 721 lnum = ubifs_next_log_lnum(c, lnum);
736 err = ubifs_leb_unmap(c, lnum); 722 err = ubifs_leb_unmap(c, lnum);
737 if (err) 723 if (err)
738 return err; 724 return err;
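All the log.c call sites above now use ubifs_next_log_lnum(), the old static next_log_lnum() apparently promoted to a shared helper so that code outside log.c can walk the log area too. Its logic is just a circular increment, as in this sketch with an illustrative first-log-LEB constant:

    #include <stdio.h>

    #define UBIFS_LOG_LNUM 3        /* illustrative first log LEB */

    /* Advance to the next log LEB, wrapping (cf. ubifs_next_log_lnum()). */
    static int next_log_lnum(int lnum, int log_last)
    {
            lnum += 1;
            if (lnum > log_last)
                    lnum = UBIFS_LOG_LNUM;
            return lnum;
    }

    int main(void)
    {
            int lnum = 6, i;

            for (i = 0; i < 5; i++) {
                    printf("%d ", lnum);
                    lnum = next_log_lnum(lnum, 7);
            }
            printf("\n");   /* prints: 6 7 3 4 5 */
            return 0;
    }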
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 0ee0847f2421..667884f4a615 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1007,21 +1007,11 @@ out:
1007} 1007}
1008 1008
1009/** 1009/**
1010 * struct scan_check_data - data provided to scan callback function.
1011 * @lst: LEB properties statistics
1012 * @err: error code
1013 */
1014struct scan_check_data {
1015 struct ubifs_lp_stats lst;
1016 int err;
1017};
1018
1019/**
1020 * scan_check_cb - scan callback. 1010 * scan_check_cb - scan callback.
1021 * @c: the UBIFS file-system description object 1011 * @c: the UBIFS file-system description object
1022 * @lp: LEB properties to scan 1012 * @lp: LEB properties to scan
1023 * @in_tree: whether the LEB properties are in main memory 1013 * @in_tree: whether the LEB properties are in main memory
1024 * @data: information passed to and from the caller of the scan 1014 * @lst: lprops statistics to update
1025 * 1015 *
1026 * This function returns a code that indicates whether the scan should continue 1016 * This function returns a code that indicates whether the scan should continue
1027 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree 1017 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
@@ -1030,11 +1020,10 @@ struct scan_check_data {
1030 */ 1020 */
1031static int scan_check_cb(struct ubifs_info *c, 1021static int scan_check_cb(struct ubifs_info *c,
1032 const struct ubifs_lprops *lp, int in_tree, 1022 const struct ubifs_lprops *lp, int in_tree,
1033 struct scan_check_data *data) 1023 struct ubifs_lp_stats *lst)
1034{ 1024{
1035 struct ubifs_scan_leb *sleb; 1025 struct ubifs_scan_leb *sleb;
1036 struct ubifs_scan_node *snod; 1026 struct ubifs_scan_node *snod;
1037 struct ubifs_lp_stats *lst = &data->lst;
1038 int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret; 1027 int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
1039 void *buf = NULL; 1028 void *buf = NULL;
1040 1029
@@ -1044,7 +1033,7 @@ static int scan_check_cb(struct ubifs_info *c,
1044 if (cat != (lp->flags & LPROPS_CAT_MASK)) { 1033 if (cat != (lp->flags & LPROPS_CAT_MASK)) {
1045 ubifs_err("bad LEB category %d expected %d", 1034 ubifs_err("bad LEB category %d expected %d",
1046 (lp->flags & LPROPS_CAT_MASK), cat); 1035 (lp->flags & LPROPS_CAT_MASK), cat);
1047 goto out; 1036 return -EINVAL;
1048 } 1037 }
1049 } 1038 }
1050 1039
@@ -1078,7 +1067,7 @@ static int scan_check_cb(struct ubifs_info *c,
1078 } 1067 }
1079 if (!found) { 1068 if (!found) {
1080 ubifs_err("bad LPT list (category %d)", cat); 1069 ubifs_err("bad LPT list (category %d)", cat);
1081 goto out; 1070 return -EINVAL;
1082 } 1071 }
1083 } 1072 }
1084 } 1073 }
@@ -1090,45 +1079,40 @@ static int scan_check_cb(struct ubifs_info *c,
1090 if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) || 1079 if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) ||
1091 lp != heap->arr[lp->hpos]) { 1080 lp != heap->arr[lp->hpos]) {
1092 ubifs_err("bad LPT heap (category %d)", cat); 1081 ubifs_err("bad LPT heap (category %d)", cat);
1093 goto out; 1082 return -EINVAL;
1094 } 1083 }
1095 } 1084 }
1096 1085
1097 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); 1086 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
1098 if (!buf) { 1087 if (!buf)
1099 ubifs_err("cannot allocate memory to scan LEB %d", lnum); 1088 return -ENOMEM;
1100 goto out; 1089
1090 /*
1091 * After an unclean unmount, empty and freeable LEBs
1092 * may contain garbage - do not scan them.
1093 */
1094 if (lp->free == c->leb_size) {
1095 lst->empty_lebs += 1;
1096 lst->total_free += c->leb_size;
1097 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1098 return LPT_SCAN_CONTINUE;
1099 }
1100 if (lp->free + lp->dirty == c->leb_size &&
1101 !(lp->flags & LPROPS_INDEX)) {
1102 lst->total_free += lp->free;
1103 lst->total_dirty += lp->dirty;
1104 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1105 return LPT_SCAN_CONTINUE;
1101 } 1106 }
1102 1107
1103 sleb = ubifs_scan(c, lnum, 0, buf, 0); 1108 sleb = ubifs_scan(c, lnum, 0, buf, 0);
1104 if (IS_ERR(sleb)) { 1109 if (IS_ERR(sleb)) {
1105 /* 1110 ret = PTR_ERR(sleb);
1106 * After an unclean unmount, empty and freeable LEBs 1111 if (ret == -EUCLEAN) {
1107 * may contain garbage. 1112 dbg_dump_lprops(c);
1108 */ 1113 dbg_dump_budg(c, &c->bi);
1109 if (lp->free == c->leb_size) {
1110 ubifs_err("scan errors were in empty LEB "
1111 "- continuing checking");
1112 lst->empty_lebs += 1;
1113 lst->total_free += c->leb_size;
1114 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1115 ret = LPT_SCAN_CONTINUE;
1116 goto exit;
1117 }
1118
1119 if (lp->free + lp->dirty == c->leb_size &&
1120 !(lp->flags & LPROPS_INDEX)) {
1121 ubifs_err("scan errors were in freeable LEB "
1122 "- continuing checking");
1123 lst->total_free += lp->free;
1124 lst->total_dirty += lp->dirty;
1125 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1126 ret = LPT_SCAN_CONTINUE;
1127 goto exit;
1128 } 1114 }
1129 data->err = PTR_ERR(sleb); 1115 goto out;
1130 ret = LPT_SCAN_STOP;
1131 goto exit;
1132 } 1116 }
1133 1117
1134 is_idx = -1; 1118 is_idx = -1;
@@ -1246,10 +1230,8 @@ static int scan_check_cb(struct ubifs_info *c,
1246 } 1230 }
1247 1231
1248 ubifs_scan_destroy(sleb); 1232 ubifs_scan_destroy(sleb);
1249 ret = LPT_SCAN_CONTINUE;
1250exit:
1251 vfree(buf); 1233 vfree(buf);
1252 return ret; 1234 return LPT_SCAN_CONTINUE;
1253 1235
1254out_print: 1236out_print:
1255 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " 1237 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
@@ -1258,10 +1240,10 @@ out_print:
1258 dbg_dump_leb(c, lnum); 1240 dbg_dump_leb(c, lnum);
1259out_destroy: 1241out_destroy:
1260 ubifs_scan_destroy(sleb); 1242 ubifs_scan_destroy(sleb);
1243 ret = -EINVAL;
1261out: 1244out:
1262 vfree(buf); 1245 vfree(buf);
1263 data->err = -EINVAL; 1246 return ret;
1264 return LPT_SCAN_STOP;
1265} 1247}
1266 1248
1267/** 1249/**
@@ -1278,8 +1260,7 @@ out:
1278int dbg_check_lprops(struct ubifs_info *c) 1260int dbg_check_lprops(struct ubifs_info *c)
1279{ 1261{
1280 int i, err; 1262 int i, err;
1281 struct scan_check_data data; 1263 struct ubifs_lp_stats lst;
1282 struct ubifs_lp_stats *lst = &data.lst;
1283 1264
1284 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) 1265 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1285 return 0; 1266 return 0;
@@ -1294,29 +1275,23 @@ int dbg_check_lprops(struct ubifs_info *c)
1294 return err; 1275 return err;
1295 } 1276 }
1296 1277
1297 memset(lst, 0, sizeof(struct ubifs_lp_stats)); 1278 memset(&lst, 0, sizeof(struct ubifs_lp_stats));
1298
1299 data.err = 0;
1300 err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1, 1279 err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
1301 (ubifs_lpt_scan_callback)scan_check_cb, 1280 (ubifs_lpt_scan_callback)scan_check_cb,
1302 &data); 1281 &lst);
1303 if (err && err != -ENOSPC) 1282 if (err && err != -ENOSPC)
1304 goto out; 1283 goto out;
1305 if (data.err) {
1306 err = data.err;
1307 goto out;
1308 }
1309 1284
1310 if (lst->empty_lebs != c->lst.empty_lebs || 1285 if (lst.empty_lebs != c->lst.empty_lebs ||
1311 lst->idx_lebs != c->lst.idx_lebs || 1286 lst.idx_lebs != c->lst.idx_lebs ||
1312 lst->total_free != c->lst.total_free || 1287 lst.total_free != c->lst.total_free ||
1313 lst->total_dirty != c->lst.total_dirty || 1288 lst.total_dirty != c->lst.total_dirty ||
1314 lst->total_used != c->lst.total_used) { 1289 lst.total_used != c->lst.total_used) {
1315 ubifs_err("bad overall accounting"); 1290 ubifs_err("bad overall accounting");
1316 ubifs_err("calculated: empty_lebs %d, idx_lebs %d, " 1291 ubifs_err("calculated: empty_lebs %d, idx_lebs %d, "
1317 "total_free %lld, total_dirty %lld, total_used %lld", 1292 "total_free %lld, total_dirty %lld, total_used %lld",
1318 lst->empty_lebs, lst->idx_lebs, lst->total_free, 1293 lst.empty_lebs, lst.idx_lebs, lst.total_free,
1319 lst->total_dirty, lst->total_used); 1294 lst.total_dirty, lst.total_used);
1320 ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, " 1295 ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, "
1321 "total_free %lld, total_dirty %lld, total_used %lld", 1296 "total_free %lld, total_dirty %lld, total_used %lld",
1322 c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free, 1297 c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free,
@@ -1325,11 +1300,11 @@ int dbg_check_lprops(struct ubifs_info *c)
1325 goto out; 1300 goto out;
1326 } 1301 }
1327 1302
1328 if (lst->total_dead != c->lst.total_dead || 1303 if (lst.total_dead != c->lst.total_dead ||
1329 lst->total_dark != c->lst.total_dark) { 1304 lst.total_dark != c->lst.total_dark) {
1330 ubifs_err("bad dead/dark space accounting"); 1305 ubifs_err("bad dead/dark space accounting");
1331 ubifs_err("calculated: total_dead %lld, total_dark %lld", 1306 ubifs_err("calculated: total_dead %lld, total_dark %lld",
1332 lst->total_dead, lst->total_dark); 1307 lst.total_dead, lst.total_dark);
1333 ubifs_err("read from lprops: total_dead %lld, total_dark %lld", 1308 ubifs_err("read from lprops: total_dead %lld, total_dark %lld",
1334 c->lst.total_dead, c->lst.total_dark); 1309 c->lst.total_dead, c->lst.total_dark);
1335 err = -EINVAL; 1310 err = -EINVAL;
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 0c9c69bd983a..dfcb5748a7dc 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -29,6 +29,12 @@
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include "ubifs.h" 30#include "ubifs.h"
31 31
32#ifdef CONFIG_UBIFS_FS_DEBUG
33static int dbg_populate_lsave(struct ubifs_info *c);
34#else
35#define dbg_populate_lsave(c) 0
36#endif
37
32/** 38/**
33 * first_dirty_cnode - find first dirty cnode. 39 * first_dirty_cnode - find first dirty cnode.
34 * @c: UBIFS file-system description object 40 * @c: UBIFS file-system description object
@@ -586,7 +592,7 @@ static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
586 if (nnode->nbranch[iip].lnum) 592 if (nnode->nbranch[iip].lnum)
587 break; 593 break;
588 } 594 }
589 } while (iip >= UBIFS_LPT_FANOUT); 595 } while (iip >= UBIFS_LPT_FANOUT);
590 596
591 /* Go right */ 597 /* Go right */
592 nnode = ubifs_get_nnode(c, nnode, iip); 598 nnode = ubifs_get_nnode(c, nnode, iip);
@@ -815,6 +821,10 @@ static void populate_lsave(struct ubifs_info *c)
815 c->lpt_drty_flgs |= LSAVE_DIRTY; 821 c->lpt_drty_flgs |= LSAVE_DIRTY;
816 ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); 822 ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
817 } 823 }
824
825 if (dbg_populate_lsave(c))
826 return;
827
818 list_for_each_entry(lprops, &c->empty_list, list) { 828 list_for_each_entry(lprops, &c->empty_list, list) {
819 c->lsave[cnt++] = lprops->lnum; 829 c->lsave[cnt++] = lprops->lnum;
820 if (cnt >= c->lsave_cnt) 830 if (cnt >= c->lsave_cnt)
@@ -1994,4 +2004,47 @@ void dbg_dump_lpt_lebs(const struct ubifs_info *c)
1994 current->pid); 2004 current->pid);
1995} 2005}
1996 2006
2007/**
2008 * dbg_populate_lsave - debugging version of 'populate_lsave()'
2009 * @c: UBIFS file-system description object
2010 *
2011 * This is a debugging version of 'populate_lsave()' which populates lsave
2012 * with random LEBs instead of useful LEBs, which is good for test coverage.
2013 * Returns zero if lsave has not been populated (this debugging feature is
2014 * disabled) and non-zero if lsave has been populated.
2015 */
2016static int dbg_populate_lsave(struct ubifs_info *c)
2017{
2018 struct ubifs_lprops *lprops;
2019 struct ubifs_lpt_heap *heap;
2020 int i;
2021
2022 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
2023 return 0;
2024 if (random32() & 3)
2025 return 0;
2026
2027 for (i = 0; i < c->lsave_cnt; i++)
2028 c->lsave[i] = c->main_first;
2029
2030 list_for_each_entry(lprops, &c->empty_list, list)
2031 c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
2032 list_for_each_entry(lprops, &c->freeable_list, list)
2033 c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
2034 list_for_each_entry(lprops, &c->frdi_idx_list, list)
2035 c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
2036
2037 heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
2038 for (i = 0; i < heap->cnt; i++)
2039 c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
2040 heap = &c->lpt_heap[LPROPS_DIRTY - 1];
2041 for (i = 0; i < heap->cnt; i++)
2042 c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
2043 heap = &c->lpt_heap[LPROPS_FREE - 1];
2044 for (i = 0; i < heap->cnt; i++)
2045 c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
2046
2047 return 1;
2048}
2049
1997#endif /* CONFIG_UBIFS_FS_DEBUG */ 2050#endif /* CONFIG_UBIFS_FS_DEBUG */
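The randomized population above follows a simple pattern: bail out roughly three times out of four, seed every lsave slot with a safe default, then scatter candidate LEB numbers into random slots. A minimal user-space sketch of that pattern, using plain rand() in place of random32(); LSAVE_CNT, MAIN_FIRST and the candidate numbers are made-up stand-ins, not UBIFS values:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define LSAVE_CNT  8
#define MAIN_FIRST 10

int main(void)
{
        int lsave[LSAVE_CNT];
        int candidates[] = { 12, 17, 23, 31, 42 };
        int i;

        srand((unsigned)time(NULL));

        if (rand() & 3)         /* take the debug path only ~1 time in 4 */
                return 0;

        for (i = 0; i < LSAVE_CNT; i++)         /* safe default everywhere */
                lsave[i] = MAIN_FIRST;

        for (i = 0; i < 5; i++)                 /* scatter candidates randomly */
                lsave[rand() % LSAVE_CNT] = candidates[i];

        for (i = 0; i < LSAVE_CNT; i++)
                printf("lsave[%d] = %d\n", i, lsave[i]);

        return 0;
}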
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 21f47afdacff..278c2382e8c2 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -148,7 +148,7 @@ static int validate_master(const struct ubifs_info *c)
148 } 148 }
149 149
150 main_sz = (long long)c->main_lebs * c->leb_size; 150 main_sz = (long long)c->main_lebs * c->leb_size;
151 if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) { 151 if (c->bi.old_idx_sz & 7 || c->bi.old_idx_sz >= main_sz) {
152 err = 9; 152 err = 9;
153 goto out; 153 goto out;
154 } 154 }
@@ -218,7 +218,7 @@ static int validate_master(const struct ubifs_info *c)
218 } 218 }
219 219
220 if (c->lst.total_dead + c->lst.total_dark + 220 if (c->lst.total_dead + c->lst.total_dark +
221 c->lst.total_used + c->old_idx_sz > main_sz) { 221 c->lst.total_used + c->bi.old_idx_sz > main_sz) {
222 err = 21; 222 err = 21;
223 goto out; 223 goto out;
224 } 224 }
@@ -286,7 +286,7 @@ int ubifs_read_master(struct ubifs_info *c)
286 c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum); 286 c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum);
287 c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum); 287 c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum);
288 c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs); 288 c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs);
289 c->old_idx_sz = le64_to_cpu(c->mst_node->index_size); 289 c->bi.old_idx_sz = le64_to_cpu(c->mst_node->index_size);
290 c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum); 290 c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum);
291 c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs); 291 c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs);
292 c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum); 292 c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum);
@@ -305,7 +305,7 @@ int ubifs_read_master(struct ubifs_info *c)
305 c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead); 305 c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead);
306 c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark); 306 c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark);
307 307
308 c->calc_idx_sz = c->old_idx_sz; 308 c->calc_idx_sz = c->bi.old_idx_sz;
309 309
310 if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS)) 310 if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS))
311 c->no_orphs = 1; 311 c->no_orphs = 1;
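The validate_master() checks above rely on 'sz & 7' being non-zero exactly when sz is not a multiple of 8. A small stand-alone illustration of the same two sanity checks; the sizes used here are arbitrary:

#include <assert.h>

/* Passes the same two sanity checks as above: the index size must be a
 * multiple of 8 and strictly smaller than the main area size. */
static int idx_size_valid(long long idx_sz, long long main_sz)
{
        if (idx_sz & 7)
                return 0;       /* not 8-byte aligned */
        if (idx_sz >= main_sz)
                return 0;       /* index cannot fill the whole main area */
        return 1;
}

int main(void)
{
        assert(idx_size_valid(4096, 1 << 20));
        assert(!idx_size_valid(4097, 1 << 20));         /* misaligned */
        assert(!idx_size_valid(1 << 20, 1 << 20));      /* too large */
        return 0;
}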
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index c3de04dc952a..0b5296a9a4c5 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -340,4 +340,21 @@ static inline void ubifs_release_lprops(struct ubifs_info *c)
340 mutex_unlock(&c->lp_mutex); 340 mutex_unlock(&c->lp_mutex);
341} 341}
342 342
343/**
344 * ubifs_next_log_lnum - switch to the next log LEB.
345 * @c: UBIFS file-system description object
346 * @lnum: current log LEB
347 *
348 * This helper function returns the log LEB number which goes next after LEB
349 * 'lnum'.
350 */
351static inline int ubifs_next_log_lnum(const struct ubifs_info *c, int lnum)
352{
353 lnum += 1;
354 if (lnum > c->log_last)
355 lnum = UBIFS_LOG_LNUM;
356
357 return lnum;
358}
359
343#endif /* __UBIFS_MISC_H__ */ 360#endif /* __UBIFS_MISC_H__ */
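A quick user-space demonstration of the wrap-around behaviour of 'ubifs_next_log_lnum()'; LOG_FIRST and LOG_LAST below are hypothetical stand-ins for UBIFS_LOG_LNUM and c->log_last:

#include <stdio.h>

#define LOG_FIRST 3
#define LOG_LAST  6

static int next_log_lnum(int lnum)
{
        lnum += 1;
        if (lnum > LOG_LAST)
                lnum = LOG_FIRST;       /* wrap back to the first log LEB */
        return lnum;
}

int main(void)
{
        int lnum = LOG_FIRST;
        int i;

        /* Walking the log twice prints: 3 4 5 6 3 4 5 6 */
        for (i = 0; i < 8; i++) {
                printf("%d ", lnum);
                lnum = next_log_lnum(lnum);
        }
        printf("\n");
        return 0;
}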
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 09df318e368f..bd644bf587a8 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -673,7 +673,8 @@ static int kill_orphans(struct ubifs_info *c)
673 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); 673 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
674 if (IS_ERR(sleb)) { 674 if (IS_ERR(sleb)) {
675 if (PTR_ERR(sleb) == -EUCLEAN) 675 if (PTR_ERR(sleb) == -EUCLEAN)
676 sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0); 676 sleb = ubifs_recover_leb(c, lnum, 0,
677 c->sbuf, 0);
677 if (IS_ERR(sleb)) { 678 if (IS_ERR(sleb)) {
678 err = PTR_ERR(sleb); 679 err = PTR_ERR(sleb);
679 break; 680 break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 3dbad6fbd1eb..731d9e2e7b50 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -564,13 +564,16 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
564} 564}
565 565
566/** 566/**
567 * drop_incomplete_group - drop nodes from an incomplete group. 567 * drop_last_node - drop the last node or group of nodes.
568 * @sleb: scanned LEB information 568 * @sleb: scanned LEB information
569 * @offs: offset of dropped nodes is returned here 569 * @offs: offset of dropped nodes is returned here
570 * @grouped: non-zero if whole group of nodes have to be dropped
570 * 571 *
571 * This function returns %1 if nodes are dropped and %0 otherwise. 572 * This is a helper function for 'ubifs_recover_leb()' which drops the last
573 * node of the scanned LEB or the last group of nodes if @grouped is not zero.
574 * This function returns %1 if a node was dropped and %0 otherwise.
572 */ 575 */
573static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) 576static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
574{ 577{
575 int dropped = 0; 578 int dropped = 0;
576 579
@@ -589,6 +592,8 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
589 kfree(snod); 592 kfree(snod);
590 sleb->nodes_cnt -= 1; 593 sleb->nodes_cnt -= 1;
591 dropped = 1; 594 dropped = 1;
595 if (!grouped)
596 break;
592 } 597 }
593 return dropped; 598 return dropped;
594} 599}
@@ -609,8 +614,7 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
609struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, 614struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
610 int offs, void *sbuf, int grouped) 615 int offs, void *sbuf, int grouped)
611{ 616{
612 int err, len = c->leb_size - offs, need_clean = 0, quiet = 1; 617 int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
613 int empty_chkd = 0, start = offs;
614 struct ubifs_scan_leb *sleb; 618 struct ubifs_scan_leb *sleb;
615 void *buf = sbuf + offs; 619 void *buf = sbuf + offs;
616 620
@@ -620,12 +624,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
620 if (IS_ERR(sleb)) 624 if (IS_ERR(sleb))
621 return sleb; 625 return sleb;
622 626
623 if (sleb->ecc) 627 ubifs_assert(len >= 8);
624 need_clean = 1;
625
626 while (len >= 8) { 628 while (len >= 8) {
627 int ret;
628
629 dbg_scan("look at LEB %d:%d (%d bytes left)", 629 dbg_scan("look at LEB %d:%d (%d bytes left)",
630 lnum, offs, len); 630 lnum, offs, len);
631 631
@@ -635,8 +635,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
635 * Scan quietly until there is an error from which we cannot 635 * Scan quietly until there is an error from which we cannot
636 * recover 636 * recover
637 */ 637 */
638 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); 638 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
639
640 if (ret == SCANNED_A_NODE) { 639 if (ret == SCANNED_A_NODE) {
641 /* A valid node, and not a padding node */ 640 /* A valid node, and not a padding node */
642 struct ubifs_ch *ch = buf; 641 struct ubifs_ch *ch = buf;
@@ -649,70 +648,32 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
649 offs += node_len; 648 offs += node_len;
650 buf += node_len; 649 buf += node_len;
651 len -= node_len; 650 len -= node_len;
652 continue; 651 } else if (ret > 0) {
653 }
654
655 if (ret > 0) {
656 /* Padding bytes or a valid padding node */ 652 /* Padding bytes or a valid padding node */
657 offs += ret; 653 offs += ret;
658 buf += ret; 654 buf += ret;
659 len -= ret; 655 len -= ret;
660 continue; 656 } else if (ret == SCANNED_EMPTY_SPACE ||
661 } 657 ret == SCANNED_GARBAGE ||
662 658 ret == SCANNED_A_BAD_PAD_NODE ||
663 if (ret == SCANNED_EMPTY_SPACE) { 659 ret == SCANNED_A_CORRUPT_NODE) {
664 if (!is_empty(buf, len)) { 660 dbg_rcvry("found corruption - %d", ret);
665 if (!is_last_write(c, buf, offs))
666 break;
667 clean_buf(c, &buf, lnum, &offs, &len);
668 need_clean = 1;
669 }
670 empty_chkd = 1;
671 break; 661 break;
672 } 662 } else {
673 663 dbg_err("unexpected return value %d", ret);
674 if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE)
675 if (is_last_write(c, buf, offs)) {
676 clean_buf(c, &buf, lnum, &offs, &len);
677 need_clean = 1;
678 empty_chkd = 1;
679 break;
680 }
681
682 if (ret == SCANNED_A_CORRUPT_NODE)
683 if (no_more_nodes(c, buf, len, lnum, offs)) {
684 clean_buf(c, &buf, lnum, &offs, &len);
685 need_clean = 1;
686 empty_chkd = 1;
687 break;
688 }
689
690 if (quiet) {
691 /* Redo the last scan but noisily */
692 quiet = 0;
693 continue;
694 }
695
696 switch (ret) {
697 case SCANNED_GARBAGE:
698 dbg_err("garbage");
699 goto corrupted;
700 case SCANNED_A_CORRUPT_NODE:
701 case SCANNED_A_BAD_PAD_NODE:
702 dbg_err("bad node");
703 goto corrupted;
704 default:
705 dbg_err("unknown");
706 err = -EINVAL; 664 err = -EINVAL;
707 goto error; 665 goto error;
708 } 666 }
709 } 667 }
710 668
711 if (!empty_chkd && !is_empty(buf, len)) { 669 if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) {
712 if (is_last_write(c, buf, offs)) { 670 if (!is_last_write(c, buf, offs))
713 clean_buf(c, &buf, lnum, &offs, &len); 671 goto corrupted_rescan;
714 need_clean = 1; 672 } else if (ret == SCANNED_A_CORRUPT_NODE) {
715 } else { 673 if (!no_more_nodes(c, buf, len, lnum, offs))
674 goto corrupted_rescan;
675 } else if (!is_empty(buf, len)) {
676 if (!is_last_write(c, buf, offs)) {
716 int corruption = first_non_ff(buf, len); 677 int corruption = first_non_ff(buf, len);
717 678
718 /* 679 /*
@@ -728,29 +689,82 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
728 } 689 }
729 } 690 }
730 691
731 /* Drop nodes from incomplete group */ 692 min_io_unit = round_down(offs, c->min_io_size);
732 if (grouped && drop_incomplete_group(sleb, &offs)) { 693 if (grouped)
733 buf = sbuf + offs; 694 /*
734 len = c->leb_size - offs; 695 * If nodes are grouped, always drop the incomplete group at
735 clean_buf(c, &buf, lnum, &offs, &len); 696 * the end.
736 need_clean = 1; 697 */
737 } 698 drop_last_node(sleb, &offs, 1);
738 699
739 if (offs % c->min_io_size) { 700 /*
740 clean_buf(c, &buf, lnum, &offs, &len); 701 * While we are in the middle of the same min. I/O unit, keep dropping
741 need_clean = 1; 702 * nodes. So basically, what we want is to make sure that the last min.
742 } 703 * I/O unit where we saw the corruption is dropped completely with all
704 * the uncorrupted nodes which may possibly sit there.
705 *
706 * In other words, let's name the min. I/O unit where the corruption
707 * starts B, and the previous min. I/O unit A. The below code tries to
708 * deal with a situation when half of B contains valid nodes or the end
709 * of a valid node, and the second half of B contains corrupted data or
710 * garbage. This means that UBIFS had been writing to B just before the
711 * power cut happened. I do not know how realistic this scenario is,
712 * where half of the min. I/O unit has been written successfully and the
713 * other half has not, but it is possible in our 'failure mode emulation'
714 * infrastructure at least.
715 *
716 * So what is the problem, why do we need to drop those nodes? Why can't
717 * we just clean up the second half of B by putting a padding node
718 * there? We can, and this works fine with one exception which was
719 * reproduced with power cut emulation testing and happens extremely
720 * rarely. The description follows, but it is worth noting that this is
721 * only about the GC head, so we could do this trick only if the bud
722 * belongs to the GC head, but it does not seem to be worth an
723 * additional "if" statement.
724 *
725 * So, imagine the file-system is full, we run GC which is moving valid
726 * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head
727 * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X
728 * and will try to continue. Imagine that LEB X is currently the
729 * dirtiest LEB, and the amount of used space in LEB Y is exactly the
730 * same as amount of free space in LEB X.
731 *
732 * And a power cut happens when nodes are moved from LEB X to LEB Y. We
733 * are here trying to recover LEB Y which is the GC head LEB. We find
734 * the min. I/O unit B as described above. Then we clean-up LEB Y by
735 * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function
736 * fails, because it cannot find a dirty LEB which could be GC'd into
737 * LEB Y! Even LEB X does not match because the amount of valid nodes
738 * there does not fit the free space in LEB Y any more! And this is
739 * because of the padding node which we added to LEB Y. The
740 * user-visible effect of this which I once observed and analysed is
741 * that we cannot mount the file-system and get an -ENOSPC error.
742 *
743 * So obviously, to make sure that situation does not happen we should
744 * free min. I/O unit B in LEB Y completely and the last used min. I/O
745 * unit in LEB Y should be A. This is basically what the below code
746 * tries to do.
747 */
748 while (min_io_unit == round_down(offs, c->min_io_size) &&
749 min_io_unit != offs &&
750 drop_last_node(sleb, &offs, grouped));
751
752 buf = sbuf + offs;
753 len = c->leb_size - offs;
743 754
755 clean_buf(c, &buf, lnum, &offs, &len);
744 ubifs_end_scan(c, sleb, lnum, offs); 756 ubifs_end_scan(c, sleb, lnum, offs);
745 757
746 if (need_clean) { 758 err = fix_unclean_leb(c, sleb, start);
747 err = fix_unclean_leb(c, sleb, start); 759 if (err)
748 if (err) 760 goto error;
749 goto error;
750 }
751 761
752 return sleb; 762 return sleb;
753 763
764corrupted_rescan:
765 /* Re-scan the corrupted data with verbose messages */
766 dbg_err("corruptio %d", ret);
767 ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
754corrupted: 768corrupted:
755 ubifs_scanned_corruption(c, lnum, offs, buf); 769 ubifs_scanned_corruption(c, lnum, offs, buf);
756 err = -EUCLEAN; 770 err = -EUCLEAN;
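The dropping logic described in the long comment above boils down to the loop condition 'min_io_unit == round_down(offs, c->min_io_size) && min_io_unit != offs'. A user-space sketch with made-up node offsets shows how it frees the min. I/O unit that contained the corruption:

#include <stdio.h>

#define round_down(x, y) ((x) - ((x) % (y)))

int main(void)
{
        int min_io_size = 512;
        /* Pretend node end offsets in the LEB; corruption was seen at 1800. */
        int node_ends[] = { 1700, 1600, 1400, 900 };
        int offs = 1800;
        int i = 0;
        int min_io_unit = round_down(offs, min_io_size);        /* 1536 */

        /*
         * Keep dropping nodes while we are still inside the min. I/O unit
         * where the corruption starts, so that unit ends up fully unused.
         */
        while (min_io_unit == round_down(offs, min_io_size) &&
               min_io_unit != offs && i < 4)
                offs = node_ends[i++];

        printf("recovery resumes at offset %d\n", offs);       /* 1400 */
        return 0;
}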
@@ -1070,6 +1084,53 @@ int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf)
1070} 1084}
1071 1085
1072/** 1086/**
1087 * grab_empty_leb - grab an empty LEB to use as GC LEB and run commit.
1088 * @c: UBIFS file-system description object
1089 *
1090 * This is a helper function for 'ubifs_rcvry_gc_commit()' which grabs an empty
1091 * LEB to be used as GC LEB (@c->gc_lnum), and then runs the commit. Returns
1092 * zero in case of success and a negative error code in case of failure.
1093 */
1094static int grab_empty_leb(struct ubifs_info *c)
1095{
1096 int lnum, err;
1097
1098 /*
1099 * Note, it is very important to first search for an empty LEB and then
1100 * run the commit, not vice-versa. The reason is that there might be
1101 * only one empty LEB at the moment, the one which has been the
1102 * @c->gc_lnum just before the power cut happened. During the regular
1103 * UBIFS operation (not now) @c->gc_lnum is marked as "taken", so no
1104 * one but GC can grab it. But at this moment this single empty LEB is
1105 * not marked as taken, so if we run commit - what happens? Right, the
1106 * commit will grab it and write the index there. Remember that the
1107 * index always expands as long as there is free space, and it only
1108 * starts consolidating when we run out of space.
1109 *
1110 * IOW, if we run commit now, we might not be able to find a free LEB
1111 * after this.
1112 */
1113 lnum = ubifs_find_free_leb_for_idx(c);
1114 if (lnum < 0) {
1115 dbg_err("could not find an empty LEB");
1116 dbg_dump_lprops(c);
1117 dbg_dump_budg(c, &c->bi);
1118 return lnum;
1119 }
1120
1121 /* Reset the index flag */
1122 err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
1123 LPROPS_INDEX, 0);
1124 if (err)
1125 return err;
1126
1127 c->gc_lnum = lnum;
1128 dbg_rcvry("found empty LEB %d, run commit", lnum);
1129
1130 return ubifs_run_commit(c);
1131}
1132
1133/**
1073 * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit. 1134 * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit.
1074 * @c: UBIFS file-system description object 1135 * @c: UBIFS file-system description object
1075 * 1136 *
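The ordering constraint explained in the comment of 'grab_empty_leb()' (find the empty LEB first, run the commit second) can be modelled in a few lines; everything below is a toy stand-in, not the UBIFS API:

#include <stdio.h>

static int empty_leb = 42;      /* the single remaining empty LEB */

static int find_free_leb(void)
{
        int lnum = empty_leb;

        if (lnum >= 0)
                empty_leb = -1; /* mark it taken */
        return lnum;
}

static int run_commit(void)
{
        /* The commit would grab an empty LEB for the index if one were
         * still available; here it is already reserved for GC. */
        if (empty_leb >= 0)
                empty_leb = -1;
        return 0;
}

int main(void)
{
        int gc_lnum = find_free_leb();  /* 1. reserve while it exists */
        int err = run_commit();         /* 2. only now run the commit */

        printf("gc_lnum=%d err=%d\n", gc_lnum, err);
        return 0;
}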
@@ -1091,71 +1152,26 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
1091{ 1152{
1092 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; 1153 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
1093 struct ubifs_lprops lp; 1154 struct ubifs_lprops lp;
1094 int lnum, err; 1155 int err;
1156
1157 dbg_rcvry("GC head LEB %d, offs %d", wbuf->lnum, wbuf->offs);
1095 1158
1096 c->gc_lnum = -1; 1159 c->gc_lnum = -1;
1097 if (wbuf->lnum == -1) { 1160 if (wbuf->lnum == -1 || wbuf->offs == c->leb_size)
1098 dbg_rcvry("no GC head LEB"); 1161 return grab_empty_leb(c);
1099 goto find_free; 1162
1100 }
1101 /*
1102 * See whether the used space in the dirtiest LEB fits in the GC head
1103 * LEB.
1104 */
1105 if (wbuf->offs == c->leb_size) {
1106 dbg_rcvry("no room in GC head LEB");
1107 goto find_free;
1108 }
1109 err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2); 1163 err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
1110 if (err) { 1164 if (err) {
1111 /* 1165 if (err != -ENOSPC)
1112 * There are no dirty or empty LEBs subject to here being
1113 * enough for the index. Try to use
1114 * 'ubifs_find_free_leb_for_idx()', which will return any empty
1115 * LEBs (ignoring index requirements). If the index then
1116 * doesn't have enough LEBs the recovery commit will fail -
1117 * which is the same result anyway i.e. recovery fails. So
1118 * there is no problem ignoring index requirements and just
1119 * grabbing a free LEB since we have already established there
1120 * is not a dirty LEB we could have used instead.
1121 */
1122 if (err == -ENOSPC) {
1123 dbg_rcvry("could not find a dirty LEB");
1124 goto find_free;
1125 }
1126 return err;
1127 }
1128 ubifs_assert(!(lp.flags & LPROPS_INDEX));
1129 lnum = lp.lnum;
1130 if (lp.free + lp.dirty == c->leb_size) {
1131 /* An empty LEB was returned */
1132 if (lp.free != c->leb_size) {
1133 err = ubifs_change_one_lp(c, lnum, c->leb_size,
1134 0, 0, 0, 0);
1135 if (err)
1136 return err;
1137 }
1138 err = ubifs_leb_unmap(c, lnum);
1139 if (err)
1140 return err; 1166 return err;
1141 c->gc_lnum = lnum; 1167
1142 dbg_rcvry("allocated LEB %d for GC", lnum); 1168 dbg_rcvry("could not find a dirty LEB");
1143 /* Run the commit */ 1169 return grab_empty_leb(c);
1144 dbg_rcvry("committing");
1145 return ubifs_run_commit(c);
1146 }
1147 /*
1148 * There was no empty LEB so the used space in the dirtiest LEB must fit
1149 * in the GC head LEB.
1150 */
1151 if (lp.free + lp.dirty < wbuf->offs) {
1152 dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d",
1153 lnum, wbuf->lnum, wbuf->offs);
1154 err = ubifs_return_leb(c, lnum);
1155 if (err)
1156 return err;
1157 goto find_free;
1158 } 1170 }
1171
1172 ubifs_assert(!(lp.flags & LPROPS_INDEX));
1173 ubifs_assert(lp.free + lp.dirty >= wbuf->offs);
1174
1159 /* 1175 /*
1160 * We run the commit before garbage collection otherwise subsequent 1176 * We run the commit before garbage collection otherwise subsequent
1161 * mounts will see the GC and orphan deletion in a different order. 1177 * mounts will see the GC and orphan deletion in a different order.
@@ -1164,11 +1180,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
1164 err = ubifs_run_commit(c); 1180 err = ubifs_run_commit(c);
1165 if (err) 1181 if (err)
1166 return err; 1182 return err;
1167 /* 1183
1168 * The data in the dirtiest LEB fits in the GC head LEB, so do the GC 1184 dbg_rcvry("GC'ing LEB %d", lp.lnum);
1169 * - use locking to keep 'ubifs_assert()' happy.
1170 */
1171 dbg_rcvry("GC'ing LEB %d", lnum);
1172 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); 1185 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
1173 err = ubifs_garbage_collect_leb(c, &lp); 1186 err = ubifs_garbage_collect_leb(c, &lp);
1174 if (err >= 0) { 1187 if (err >= 0) {
@@ -1184,37 +1197,17 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
1184 err = -EINVAL; 1197 err = -EINVAL;
1185 return err; 1198 return err;
1186 } 1199 }
1187 if (err != LEB_RETAINED) { 1200
1188 dbg_err("GC returned %d", err); 1201 ubifs_assert(err == LEB_RETAINED);
1202 if (err != LEB_RETAINED)
1189 return -EINVAL; 1203 return -EINVAL;
1190 } 1204
1191 err = ubifs_leb_unmap(c, c->gc_lnum); 1205 err = ubifs_leb_unmap(c, c->gc_lnum);
1192 if (err) 1206 if (err)
1193 return err; 1207 return err;
1194 dbg_rcvry("allocated LEB %d for GC", lnum);
1195 return 0;
1196 1208
1197find_free: 1209 dbg_rcvry("allocated LEB %d for GC", lp.lnum);
1198 /* 1210 return 0;
1199 * There is no GC head LEB or the free space in the GC head LEB is too
1200 * small, or there are not dirty LEBs. Allocate gc_lnum by calling
1201 * 'ubifs_find_free_leb_for_idx()' so GC is not run.
1202 */
1203 lnum = ubifs_find_free_leb_for_idx(c);
1204 if (lnum < 0) {
1205 dbg_err("could not find an empty LEB");
1206 return lnum;
1207 }
1208 /* And reset the index flag */
1209 err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
1210 LPROPS_INDEX, 0);
1211 if (err)
1212 return err;
1213 c->gc_lnum = lnum;
1214 dbg_rcvry("allocated LEB %d for GC", lnum);
1215 /* Run the commit */
1216 dbg_rcvry("committing");
1217 return ubifs_run_commit(c);
1218} 1211}
1219 1212
1220/** 1213/**
@@ -1456,7 +1449,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
1456 err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); 1449 err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
1457 if (err) 1450 if (err)
1458 goto out; 1451 goto out;
1459 dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ", 1452 dbg_rcvry("inode %lu at %d:%d size %lld -> %lld",
1460 (unsigned long)e->inum, lnum, offs, i_size, e->d_size); 1453 (unsigned long)e->inum, lnum, offs, i_size, e->d_size);
1461 return 0; 1454 return 0;
1462 1455
@@ -1505,20 +1498,27 @@ int ubifs_recover_size(struct ubifs_info *c)
1505 e->i_size = le64_to_cpu(ino->size); 1498 e->i_size = le64_to_cpu(ino->size);
1506 } 1499 }
1507 } 1500 }
1501
1508 if (e->exists && e->i_size < e->d_size) { 1502 if (e->exists && e->i_size < e->d_size) {
1509 if (!e->inode && c->ro_mount) { 1503 if (c->ro_mount) {
1510 /* Fix the inode size and pin it in memory */ 1504 /* Fix the inode size and pin it in memory */
1511 struct inode *inode; 1505 struct inode *inode;
1506 struct ubifs_inode *ui;
1507
1508 ubifs_assert(!e->inode);
1512 1509
1513 inode = ubifs_iget(c->vfs_sb, e->inum); 1510 inode = ubifs_iget(c->vfs_sb, e->inum);
1514 if (IS_ERR(inode)) 1511 if (IS_ERR(inode))
1515 return PTR_ERR(inode); 1512 return PTR_ERR(inode);
1513
1514 ui = ubifs_inode(inode);
1516 if (inode->i_size < e->d_size) { 1515 if (inode->i_size < e->d_size) {
1517 dbg_rcvry("ino %lu size %lld -> %lld", 1516 dbg_rcvry("ino %lu size %lld -> %lld",
1518 (unsigned long)e->inum, 1517 (unsigned long)e->inum,
1519 e->d_size, inode->i_size); 1518 inode->i_size, e->d_size);
1520 inode->i_size = e->d_size; 1519 inode->i_size = e->d_size;
1521 ubifs_inode(inode)->ui_size = e->d_size; 1520 ui->ui_size = e->d_size;
1521 ui->synced_i_size = e->d_size;
1522 e->inode = inode; 1522 e->inode = inode;
1523 this = rb_next(this); 1523 this = rb_next(this);
1524 continue; 1524 continue;
@@ -1533,9 +1533,11 @@ int ubifs_recover_size(struct ubifs_info *c)
1533 iput(e->inode); 1533 iput(e->inode);
1534 } 1534 }
1535 } 1535 }
1536
1536 this = rb_next(this); 1537 this = rb_next(this);
1537 rb_erase(&e->rb, &c->size_tree); 1538 rb_erase(&e->rb, &c->size_tree);
1538 kfree(e); 1539 kfree(e);
1539 } 1540 }
1541
1540 return 0; 1542 return 0;
1541} 1543}
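The size-recovery rule implemented above is simply: if data nodes extend past the recorded inode size, raise the inode size to cover them. A user-space model of that rule with a cut-down, hypothetical size_entry:

#include <stdio.h>

struct size_entry {
        long long i_size;       /* size recorded in the inode node */
        long long d_size;       /* highest offset covered by data nodes */
};

static void fix_size(struct size_entry *e)
{
        if (e->i_size < e->d_size)
                e->i_size = e->d_size;  /* pin the larger, recovered size */
}

int main(void)
{
        struct size_entry e = { 4096, 12288 };

        fix_size(&e);
        printf("recovered size: %lld\n", e.i_size);     /* 12288 */
        return 0;
}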
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index d3d6d365bfc1..6617280d1679 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -33,44 +33,32 @@
33 */ 33 */
34 34
35#include "ubifs.h" 35#include "ubifs.h"
36 36#include <linux/list_sort.h>
37/*
38 * Replay flags.
39 *
40 * REPLAY_DELETION: node was deleted
41 * REPLAY_REF: node is a reference node
42 */
43enum {
44 REPLAY_DELETION = 1,
45 REPLAY_REF = 2,
46};
47 37
48/** 38/**
49 * struct replay_entry - replay tree entry. 39 * struct replay_entry - replay list entry.
50 * @lnum: logical eraseblock number of the node 40 * @lnum: logical eraseblock number of the node
51 * @offs: node offset 41 * @offs: node offset
52 * @len: node length 42 * @len: node length
43 * @deletion: non-zero if this entry corresponds to a node deletion
53 * @sqnum: node sequence number 44 * @sqnum: node sequence number
54 * @flags: replay flags 45 * @list: links the replay list
55 * @rb: links the replay tree
56 * @key: node key 46 * @key: node key
57 * @nm: directory entry name 47 * @nm: directory entry name
58 * @old_size: truncation old size 48 * @old_size: truncation old size
59 * @new_size: truncation new size 49 * @new_size: truncation new size
60 * @free: amount of free space in a bud
61 * @dirty: amount of dirty space in a bud from padding and deletion nodes
62 * @jhead: journal head number of the bud
63 * 50 *
64 * UBIFS journal replay must compare node sequence numbers, which means it must 51 * The replay process first scans all buds and builds the replay list, then
65 * build a tree of node information to insert into the TNC. 52 * sorts the replay list in node sequence number order, and then inserts all
53 * the replay entries into the TNC.
66 */ 54 */
67struct replay_entry { 55struct replay_entry {
68 int lnum; 56 int lnum;
69 int offs; 57 int offs;
70 int len; 58 int len;
59 unsigned int deletion:1;
71 unsigned long long sqnum; 60 unsigned long long sqnum;
72 int flags; 61 struct list_head list;
73 struct rb_node rb;
74 union ubifs_key key; 62 union ubifs_key key;
75 union { 63 union {
76 struct qstr nm; 64 struct qstr nm;
@@ -78,11 +66,6 @@ struct replay_entry {
78 loff_t old_size; 66 loff_t old_size;
79 loff_t new_size; 67 loff_t new_size;
80 }; 68 };
81 struct {
82 int free;
83 int dirty;
84 int jhead;
85 };
86 }; 69 };
87}; 70};
88 71
@@ -90,57 +73,64 @@ struct replay_entry {
90 * struct bud_entry - entry in the list of buds to replay. 73 * struct bud_entry - entry in the list of buds to replay.
91 * @list: next bud in the list 74 * @list: next bud in the list
92 * @bud: bud description object 75 * @bud: bud description object
93 * @free: free bytes in the bud
94 * @sqnum: reference node sequence number 76 * @sqnum: reference node sequence number
77 * @free: free bytes in the bud
78 * @dirty: dirty bytes in the bud
95 */ 79 */
96struct bud_entry { 80struct bud_entry {
97 struct list_head list; 81 struct list_head list;
98 struct ubifs_bud *bud; 82 struct ubifs_bud *bud;
99 int free;
100 unsigned long long sqnum; 83 unsigned long long sqnum;
84 int free;
85 int dirty;
101}; 86};
102 87
103/** 88/**
104 * set_bud_lprops - set free and dirty space used by a bud. 89 * set_bud_lprops - set free and dirty space used by a bud.
105 * @c: UBIFS file-system description object 90 * @c: UBIFS file-system description object
106 * @r: replay entry of bud 91 * @b: bud entry which describes the bud
92 *
93 * This function makes sure the LEB properties of bud @b are set correctly
94 * after the replay. Returns zero in case of success and a negative error code
95 * in case of failure.
107 */ 96 */
108static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) 97static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b)
109{ 98{
110 const struct ubifs_lprops *lp; 99 const struct ubifs_lprops *lp;
111 int err = 0, dirty; 100 int err = 0, dirty;
112 101
113 ubifs_get_lprops(c); 102 ubifs_get_lprops(c);
114 103
115 lp = ubifs_lpt_lookup_dirty(c, r->lnum); 104 lp = ubifs_lpt_lookup_dirty(c, b->bud->lnum);
116 if (IS_ERR(lp)) { 105 if (IS_ERR(lp)) {
117 err = PTR_ERR(lp); 106 err = PTR_ERR(lp);
118 goto out; 107 goto out;
119 } 108 }
120 109
121 dirty = lp->dirty; 110 dirty = lp->dirty;
122 if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) { 111 if (b->bud->start == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
123 /* 112 /*
124 * The LEB was added to the journal with a starting offset of 113 * The LEB was added to the journal with a starting offset of
125 * zero which means the LEB must have been empty. The LEB 114 * zero which means the LEB must have been empty. The LEB
126 * property values should be lp->free == c->leb_size and 115 * property values should be @lp->free == @c->leb_size and
127 * lp->dirty == 0, but that is not the case. The reason is that 116 * @lp->dirty == 0, but that is not the case. The reason is that
128 * the LEB was garbage collected. The garbage collector resets 117 * the LEB had been garbage collected before it became the bud,
129 * the free and dirty space without recording it anywhere except 118 * and there was no commit in between. The garbage collector
130 * lprops, so if there is not a commit then lprops does not have 119 * resets the free and dirty space without recording it
131 * that information next time the file system is mounted. 120 * anywhere except lprops, so if there was no commit then
121 * lprops does not have that information.
132 * 122 *
133 * We do not need to adjust free space because the scan has told 123 * We do not need to adjust free space because the scan has told
134 * us the exact value which is recorded in the replay entry as 124 * us the exact value which is recorded in the replay entry as
135 * r->free. 125 * @b->free.
136 * 126 *
137 * However we do need to subtract from the dirty space the 127 * However we do need to subtract from the dirty space the
138 * amount of space that the garbage collector reclaimed, which 128 * amount of space that the garbage collector reclaimed, which
139 * is the whole LEB minus the amount of space that was free. 129 * is the whole LEB minus the amount of space that was free.
140 */ 130 */
141 dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum, 131 dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum,
142 lp->free, lp->dirty); 132 lp->free, lp->dirty);
143 dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum, 133 dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum,
144 lp->free, lp->dirty); 134 lp->free, lp->dirty);
145 dirty -= c->leb_size - lp->free; 135 dirty -= c->leb_size - lp->free;
146 /* 136 /*
@@ -152,10 +142,10 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
152 */ 142 */
153 if (dirty != 0) 143 if (dirty != 0)
154 dbg_msg("LEB %d lp: %d free %d dirty " 144 dbg_msg("LEB %d lp: %d free %d dirty "
155 "replay: %d free %d dirty", r->lnum, lp->free, 145 "replay: %d free %d dirty", b->bud->lnum,
156 lp->dirty, r->free, r->dirty); 146 lp->free, lp->dirty, b->free, b->dirty);
157 } 147 }
158 lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty, 148 lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty,
159 lp->flags | LPROPS_TAKEN, 0); 149 lp->flags | LPROPS_TAKEN, 0);
160 if (IS_ERR(lp)) { 150 if (IS_ERR(lp)) {
161 err = PTR_ERR(lp); 151 err = PTR_ERR(lp);
@@ -163,8 +153,9 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
163 } 153 }
164 154
165 /* Make sure the journal head points to the latest bud */ 155 /* Make sure the journal head points to the latest bud */
166 err = ubifs_wbuf_seek_nolock(&c->jheads[r->jhead].wbuf, r->lnum, 156 err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf,
167 c->leb_size - r->free, UBI_SHORTTERM); 157 b->bud->lnum, c->leb_size - b->free,
158 UBI_SHORTTERM);
168 159
169out: 160out:
170 ubifs_release_lprops(c); 161 ubifs_release_lprops(c);
@@ -172,6 +163,27 @@ out:
172} 163}
173 164
174/** 165/**
166 * set_buds_lprops - set free and dirty space for all replayed buds.
167 * @c: UBIFS file-system description object
168 *
169 * This function sets LEB properties for all replayed buds. Returns zero in
170 * case of success and a negative error code in case of failure.
171 */
172static int set_buds_lprops(struct ubifs_info *c)
173{
174 struct bud_entry *b;
175 int err;
176
177 list_for_each_entry(b, &c->replay_buds, list) {
178 err = set_bud_lprops(c, b);
179 if (err)
180 return err;
181 }
182
183 return 0;
184}
185
186/**
175 * trun_remove_range - apply a replay entry for a truncation to the TNC. 187 * trun_remove_range - apply a replay entry for a truncation to the TNC.
176 * @c: UBIFS file-system description object 188 * @c: UBIFS file-system description object
177 * @r: replay entry of truncation 189 * @r: replay entry of truncation
@@ -207,24 +219,22 @@ static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r)
207 */ 219 */
208static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) 220static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
209{ 221{
210 int err, deletion = ((r->flags & REPLAY_DELETION) != 0); 222 int err;
211 223
212 dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum, 224 dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum,
213 r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key)); 225 r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key));
214 226
215 /* Set c->replay_sqnum to help deal with dangling branches. */ 227 /* Set c->replay_sqnum to help deal with dangling branches. */
216 c->replay_sqnum = r->sqnum; 228 c->replay_sqnum = r->sqnum;
217 229
218 if (r->flags & REPLAY_REF) 230 if (is_hash_key(c, &r->key)) {
219 err = set_bud_lprops(c, r); 231 if (r->deletion)
220 else if (is_hash_key(c, &r->key)) {
221 if (deletion)
222 err = ubifs_tnc_remove_nm(c, &r->key, &r->nm); 232 err = ubifs_tnc_remove_nm(c, &r->key, &r->nm);
223 else 233 else
224 err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs, 234 err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs,
225 r->len, &r->nm); 235 r->len, &r->nm);
226 } else { 236 } else {
227 if (deletion) 237 if (r->deletion)
228 switch (key_type(c, &r->key)) { 238 switch (key_type(c, &r->key)) {
229 case UBIFS_INO_KEY: 239 case UBIFS_INO_KEY:
230 { 240 {
@@ -247,7 +257,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
247 return err; 257 return err;
248 258
249 if (c->need_recovery) 259 if (c->need_recovery)
250 err = ubifs_recover_size_accum(c, &r->key, deletion, 260 err = ubifs_recover_size_accum(c, &r->key, r->deletion,
251 r->new_size); 261 r->new_size);
252 } 262 }
253 263
@@ -255,68 +265,77 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
255} 265}
256 266
257/** 267/**
258 * destroy_replay_tree - destroy the replay. 268 * replay_entries_cmp - compare 2 replay entries.
259 * @c: UBIFS file-system description object 269 * @priv: UBIFS file-system description object
270 * @a: first replay entry
271 * @b: second replay entry
260 * 272 *
261 * Destroy the replay tree. 273 * This is a comparison function for 'list_sort()' which compares 2 replay
274 * entries @a and @b by comparing their sequence numbers. Returns %1 if @a has
275 * a greater sequence number and %-1 otherwise.
262 */ 276 */
263static void destroy_replay_tree(struct ubifs_info *c) 277static int replay_entries_cmp(void *priv, struct list_head *a,
278 struct list_head *b)
264{ 279{
265 struct rb_node *this = c->replay_tree.rb_node; 280 struct replay_entry *ra, *rb;
266 struct replay_entry *r; 281
267 282 cond_resched();
268 while (this) { 283 if (a == b)
269 if (this->rb_left) { 284 return 0;
270 this = this->rb_left; 285
271 continue; 286 ra = list_entry(a, struct replay_entry, list);
272 } else if (this->rb_right) { 287 rb = list_entry(b, struct replay_entry, list);
273 this = this->rb_right; 288 ubifs_assert(ra->sqnum != rb->sqnum);
274 continue; 289 if (ra->sqnum > rb->sqnum)
275 } 290 return 1;
276 r = rb_entry(this, struct replay_entry, rb); 291 return -1;
277 this = rb_parent(this);
278 if (this) {
279 if (this->rb_left == &r->rb)
280 this->rb_left = NULL;
281 else
282 this->rb_right = NULL;
283 }
284 if (is_hash_key(c, &r->key))
285 kfree(r->nm.name);
286 kfree(r);
287 }
288 c->replay_tree = RB_ROOT;
289} 292}
290 293
291/** 294/**
292 * apply_replay_tree - apply the replay tree to the TNC. 295 * apply_replay_list - apply the replay list to the TNC.
293 * @c: UBIFS file-system description object 296 * @c: UBIFS file-system description object
294 * 297 *
295 * Apply the replay tree. 298 * Apply all entries in the replay list to the TNC. Returns zero in case of
296 * Returns zero in case of success and a negative error code in case of 299 * success and a negative error code in case of failure.
297 * failure.
298 */ 300 */
299static int apply_replay_tree(struct ubifs_info *c) 301static int apply_replay_list(struct ubifs_info *c)
300{ 302{
301 struct rb_node *this = rb_first(&c->replay_tree); 303 struct replay_entry *r;
304 int err;
302 305
303 while (this) { 306 list_sort(c, &c->replay_list, &replay_entries_cmp);
304 struct replay_entry *r;
305 int err;
306 307
308 list_for_each_entry(r, &c->replay_list, list) {
307 cond_resched(); 309 cond_resched();
308 310
309 r = rb_entry(this, struct replay_entry, rb);
310 err = apply_replay_entry(c, r); 311 err = apply_replay_entry(c, r);
311 if (err) 312 if (err)
312 return err; 313 return err;
313 this = rb_next(this);
314 } 314 }
315
315 return 0; 316 return 0;
316} 317}
317 318
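Since 'list_sort()' with 'replay_entries_cmp()' only orders entries by sequence number, the same effect can be sketched in user space with qsort(); the struct below is a cut-down, hypothetical replay_entry:

#include <stdio.h>
#include <stdlib.h>

struct entry {
        unsigned long long sqnum;
};

static int cmp(const void *a, const void *b)
{
        const struct entry *ra = a, *rb = b;

        if (ra->sqnum == rb->sqnum)
                return 0;       /* cannot happen for real replay entries */
        return ra->sqnum > rb->sqnum ? 1 : -1;
}

int main(void)
{
        struct entry e[] = { { 7 }, { 3 }, { 9 }, { 1 } };
        int i;

        qsort(e, 4, sizeof(e[0]), cmp);
        for (i = 0; i < 4; i++)
                printf("%llu ", e[i].sqnum);    /* prints: 1 3 7 9 */
        printf("\n");
        return 0;
}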
318/** 319/**
319 * insert_node - insert a node to the replay tree. 320 * destroy_replay_list - destroy the replay.
321 * @c: UBIFS file-system description object
322 *
323 * Destroy the replay list.
324 */
325static void destroy_replay_list(struct ubifs_info *c)
326{
327 struct replay_entry *r, *tmp;
328
329 list_for_each_entry_safe(r, tmp, &c->replay_list, list) {
330 if (is_hash_key(c, &r->key))
331 kfree(r->nm.name);
332 list_del(&r->list);
333 kfree(r);
334 }
335}
336
337/**
338 * insert_node - insert a node to the replay list
320 * @c: UBIFS file-system description object 339 * @c: UBIFS file-system description object
321 * @lnum: node logical eraseblock number 340 * @lnum: node logical eraseblock number
322 * @offs: node offset 341 * @offs: node offset
@@ -328,39 +347,25 @@ static int apply_replay_tree(struct ubifs_info *c)
328 * @old_size: truncation old size 347 * @old_size: truncation old size
329 * @new_size: truncation new size 348 * @new_size: truncation new size
330 * 349 *
331 * This function inserts a scanned non-direntry node to the replay tree. The 350 * This function inserts a scanned non-direntry node to the replay list. The
332 * replay tree is an RB-tree containing @struct replay_entry elements which are 351 * replay list contains @struct replay_entry elements, and we sort this list in
333 * indexed by the sequence number. The replay tree is applied at the very end 352 * sequence number order before applying it. The replay list is applied at the
334 * of the replay process. Since the tree is sorted in sequence number order, 353 * very end of the replay process. Since the list is sorted in sequence number
335 * the older modifications are applied first. This function returns zero in 354 * order, the older modifications are applied first. This function returns zero
336 * case of success and a negative error code in case of failure. 355 * in case of success and a negative error code in case of failure.
337 */ 356 */
338static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, 357static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
339 union ubifs_key *key, unsigned long long sqnum, 358 union ubifs_key *key, unsigned long long sqnum,
340 int deletion, int *used, loff_t old_size, 359 int deletion, int *used, loff_t old_size,
341 loff_t new_size) 360 loff_t new_size)
342{ 361{
343 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
344 struct replay_entry *r; 362 struct replay_entry *r;
345 363
364 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
365
346 if (key_inum(c, key) >= c->highest_inum) 366 if (key_inum(c, key) >= c->highest_inum)
347 c->highest_inum = key_inum(c, key); 367 c->highest_inum = key_inum(c, key);
348 368
349 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
350 while (*p) {
351 parent = *p;
352 r = rb_entry(parent, struct replay_entry, rb);
353 if (sqnum < r->sqnum) {
354 p = &(*p)->rb_left;
355 continue;
356 } else if (sqnum > r->sqnum) {
357 p = &(*p)->rb_right;
358 continue;
359 }
360 ubifs_err("duplicate sqnum in replay");
361 return -EINVAL;
362 }
363
364 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); 369 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
365 if (!r) 370 if (!r)
366 return -ENOMEM; 371 return -ENOMEM;
@@ -370,19 +375,18 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
370 r->lnum = lnum; 375 r->lnum = lnum;
371 r->offs = offs; 376 r->offs = offs;
372 r->len = len; 377 r->len = len;
378 r->deletion = !!deletion;
373 r->sqnum = sqnum; 379 r->sqnum = sqnum;
374 r->flags = (deletion ? REPLAY_DELETION : 0); 380 key_copy(c, key, &r->key);
375 r->old_size = old_size; 381 r->old_size = old_size;
376 r->new_size = new_size; 382 r->new_size = new_size;
377 key_copy(c, key, &r->key);
378 383
379 rb_link_node(&r->rb, parent, p); 384 list_add_tail(&r->list, &c->replay_list);
380 rb_insert_color(&r->rb, &c->replay_tree);
381 return 0; 385 return 0;
382} 386}
383 387
384/** 388/**
385 * insert_dent - insert a directory entry node into the replay tree. 389 * insert_dent - insert a directory entry node into the replay list.
386 * @c: UBIFS file-system description object 390 * @c: UBIFS file-system description object
387 * @lnum: node logical eraseblock number 391 * @lnum: node logical eraseblock number
388 * @offs: node offset 392 * @offs: node offset
@@ -394,43 +398,25 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
394 * @deletion: non-zero if this is a deletion 398 * @deletion: non-zero if this is a deletion
395 * @used: number of bytes in use in a LEB 399 * @used: number of bytes in use in a LEB
396 * 400 *
397 * This function inserts a scanned directory entry node to the replay tree. 401 * This function inserts a scanned directory entry node or an extended
398 * Returns zero in case of success and a negative error code in case of 402 * attribute entry to the replay list. Returns zero in case of success and a
399 * failure. 403 * negative error code in case of failure.
400 *
401 * This function is also used for extended attribute entries because they are
402 * implemented as directory entry nodes.
403 */ 404 */
404static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, 405static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
405 union ubifs_key *key, const char *name, int nlen, 406 union ubifs_key *key, const char *name, int nlen,
406 unsigned long long sqnum, int deletion, int *used) 407 unsigned long long sqnum, int deletion, int *used)
407{ 408{
408 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
409 struct replay_entry *r; 409 struct replay_entry *r;
410 char *nbuf; 410 char *nbuf;
411 411
412 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
412 if (key_inum(c, key) >= c->highest_inum) 413 if (key_inum(c, key) >= c->highest_inum)
413 c->highest_inum = key_inum(c, key); 414 c->highest_inum = key_inum(c, key);
414 415
415 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
416 while (*p) {
417 parent = *p;
418 r = rb_entry(parent, struct replay_entry, rb);
419 if (sqnum < r->sqnum) {
420 p = &(*p)->rb_left;
421 continue;
422 }
423 if (sqnum > r->sqnum) {
424 p = &(*p)->rb_right;
425 continue;
426 }
427 ubifs_err("duplicate sqnum in replay");
428 return -EINVAL;
429 }
430
431 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); 416 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
432 if (!r) 417 if (!r)
433 return -ENOMEM; 418 return -ENOMEM;
419
434 nbuf = kmalloc(nlen + 1, GFP_KERNEL); 420 nbuf = kmalloc(nlen + 1, GFP_KERNEL);
435 if (!nbuf) { 421 if (!nbuf) {
436 kfree(r); 422 kfree(r);
@@ -442,17 +428,15 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
442 r->lnum = lnum; 428 r->lnum = lnum;
443 r->offs = offs; 429 r->offs = offs;
444 r->len = len; 430 r->len = len;
431 r->deletion = !!deletion;
445 r->sqnum = sqnum; 432 r->sqnum = sqnum;
433 key_copy(c, key, &r->key);
446 r->nm.len = nlen; 434 r->nm.len = nlen;
447 memcpy(nbuf, name, nlen); 435 memcpy(nbuf, name, nlen);
448 nbuf[nlen] = '\0'; 436 nbuf[nlen] = '\0';
449 r->nm.name = nbuf; 437 r->nm.name = nbuf;
450 r->flags = (deletion ? REPLAY_DELETION : 0);
451 key_copy(c, key, &r->key);
452 438
453 ubifs_assert(!*p); 439 list_add_tail(&r->list, &c->replay_list);
454 rb_link_node(&r->rb, parent, p);
455 rb_insert_color(&r->rb, &c->replay_tree);
456 return 0; 440 return 0;
457} 441}
458 442
@@ -489,29 +473,92 @@ int ubifs_validate_entry(struct ubifs_info *c,
489} 473}
490 474
491/** 475/**
476 * is_last_bud - check if the bud is the last in the journal head.
477 * @c: UBIFS file-system description object
478 * @bud: bud description object
479 *
480 * This function checks if bud @bud is the last bud in its journal head. This
481 * information is then used by 'replay_bud()' to decide whether the bud can
482 * have corruptions or not. Indeed, only last buds can be corrupted by power
483 * cuts. Returns %1 if this is the last bud, and %0 if not.
484 */
485static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud)
486{
487 struct ubifs_jhead *jh = &c->jheads[bud->jhead];
488 struct ubifs_bud *next;
489 uint32_t data;
490 int err;
491
492 if (list_is_last(&bud->list, &jh->buds_list))
493 return 1;
494
495 /*
496 * The following is a quirk to make sure we work correctly with UBIFS
497 * images used with older UBIFS.
498 *
499 * Normally, the last bud will be the last in the journal head's list
500 * of buds. However, there is one exception if the UBIFS image belongs
501 * to older UBIFS. This is fairly unlikely: one would need to use old
502 * UBIFS, then have a power cut exactly at the right point, and then
503 * try to mount this image with new UBIFS.
504 *
505 * The exception is: it is possible to have 2 buds A and B, A goes
506 * before B, and B is the last, bud B is contains no data, and bud A is
507 * corrupted at the end. The reason is that in older versions when the
508 * journal code switched the next bud (from A to B), it first added a
509 * log reference node for the new bud (B), and only after this it
510 * synchronized the write-buffer of current bud (A). But later this was
511 * changed and UBIFS started to always synchronize the write-buffer of
512 * the bud (A) before writing the log reference for the new bud (B).
513 *
514 * But because older UBIFS always synchronized A's write-buffer before
515 * writing to B, we can recognize this exceptional situation but
516 * checking the contents of bud B - if it is empty, then A can be
517 * treated as the last and we can recover it.
518 *
519 * TODO: remove this piece of code in a couple of years (today it is
520 * 16.05.2011).
521 */
522 next = list_entry(bud->list.next, struct ubifs_bud, list);
523 if (!list_is_last(&next->list, &jh->buds_list))
524 return 0;
525
526 err = ubi_read(c->ubi, next->lnum, (char *)&data,
527 next->start, 4);
528 if (err)
529 return 0;
530
531 return data == 0xFFFFFFFF;
532}
533
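The probe at the end of 'is_last_bud()' exploits the fact that erased flash reads back as all 0xFF bytes, so the first 4 bytes being 0xFFFFFFFF means the bud holds no data. A stand-alone sketch of that check; the byte values are arbitrary:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int bud_is_empty(const unsigned char *leb)
{
        uint32_t data;

        memcpy(&data, leb, 4);  /* read the first 4 bytes of the bud */
        return data == 0xFFFFFFFF;
}

int main(void)
{
        unsigned char erased[4]  = { 0xff, 0xff, 0xff, 0xff };
        unsigned char written[4] = { 0x06, 0x10, 0x18, 0x31 }; /* arbitrary */

        printf("%d %d\n", bud_is_empty(erased), bud_is_empty(written));
        return 0;
}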
534/**
492 * replay_bud - replay a bud logical eraseblock. 535 * replay_bud - replay a bud logical eraseblock.
493 * @c: UBIFS file-system description object 536 * @c: UBIFS file-system description object
494 * @lnum: bud logical eraseblock number to replay 537 * @b: bud entry which describes the bud
495 * @offs: bud start offset
496 * @jhead: journal head to which this bud belongs
497 * @free: amount of free space in the bud is returned here
498 * @dirty: amount of dirty space from padding and deletion nodes is returned
499 * here
500 * 538 *
501 * This function returns zero in case of success and a negative error code in 539 * This function replays bud @bud, recovers it if needed, and adds all nodes
502 * case of failure. 540 * from this bud to the replay list. Returns zero in case of success and a
541 * negative error code in case of failure.
503 */ 542 */
504static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, 543static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
505 int *free, int *dirty)
506{ 544{
507 int err = 0, used = 0; 545 int is_last = is_last_bud(c, b->bud);
546 int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start;
508 struct ubifs_scan_leb *sleb; 547 struct ubifs_scan_leb *sleb;
509 struct ubifs_scan_node *snod; 548 struct ubifs_scan_node *snod;
510 struct ubifs_bud *bud;
511 549
512 dbg_mnt("replay bud LEB %d, head %d", lnum, jhead); 550 dbg_mnt("replay bud LEB %d, head %d, offs %d, is_last %d",
513 if (c->need_recovery) 551 lnum, b->bud->jhead, offs, is_last);
514 sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD); 552
553 if (c->need_recovery && is_last)
554 /*
555 * Recover only last LEBs in the journal heads, because power
556 * cuts may cause corruptions only in these LEBs, because only
557 * these LEBs could possibly be written to at the power cut
558 * time.
559 */
560 sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf,
561 b->bud->jhead != GCHD);
515 else 562 else
516 sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0); 563 sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
517 if (IS_ERR(sleb)) 564 if (IS_ERR(sleb))
@@ -627,15 +674,13 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
627 goto out; 674 goto out;
628 } 675 }
629 676
630 bud = ubifs_search_bud(c, lnum); 677 ubifs_assert(ubifs_search_bud(c, lnum));
631 if (!bud)
632 BUG();
633
634 ubifs_assert(sleb->endpt - offs >= used); 678 ubifs_assert(sleb->endpt - offs >= used);
635 ubifs_assert(sleb->endpt % c->min_io_size == 0); 679 ubifs_assert(sleb->endpt % c->min_io_size == 0);
636 680
637 *dirty = sleb->endpt - offs - used; 681 b->dirty = sleb->endpt - offs - used;
638 *free = c->leb_size - sleb->endpt; 682 b->free = c->leb_size - sleb->endpt;
683 dbg_mnt("bud LEB %d replied: dirty %d, free %d", lnum, b->dirty, b->free);
639 684
640out: 685out:
641 ubifs_scan_destroy(sleb); 686 ubifs_scan_destroy(sleb);
@@ -649,58 +694,6 @@ out_dump:
649} 694}
650 695
651/** 696/**
652 * insert_ref_node - insert a reference node to the replay tree.
653 * @c: UBIFS file-system description object
654 * @lnum: node logical eraseblock number
655 * @offs: node offset
656 * @sqnum: sequence number
657 * @free: amount of free space in bud
658 * @dirty: amount of dirty space from padding and deletion nodes
659 * @jhead: journal head number for the bud
660 *
661 * This function inserts a reference node to the replay tree and returns zero
662 * in case of success or a negative error code in case of failure.
663 */
664static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
665 unsigned long long sqnum, int free, int dirty,
666 int jhead)
667{
668 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
669 struct replay_entry *r;
670
671 dbg_mnt("add ref LEB %d:%d", lnum, offs);
672 while (*p) {
673 parent = *p;
674 r = rb_entry(parent, struct replay_entry, rb);
675 if (sqnum < r->sqnum) {
676 p = &(*p)->rb_left;
677 continue;
678 } else if (sqnum > r->sqnum) {
679 p = &(*p)->rb_right;
680 continue;
681 }
682 ubifs_err("duplicate sqnum in replay tree");
683 return -EINVAL;
684 }
685
686 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
687 if (!r)
688 return -ENOMEM;
689
690 r->lnum = lnum;
691 r->offs = offs;
692 r->sqnum = sqnum;
693 r->flags = REPLAY_REF;
694 r->free = free;
695 r->dirty = dirty;
696 r->jhead = jhead;
697
698 rb_link_node(&r->rb, parent, p);
699 rb_insert_color(&r->rb, &c->replay_tree);
700 return 0;
701}
702
703/**
704 * replay_buds - replay all buds. 697 * replay_buds - replay all buds.
705 * @c: UBIFS file-system description object 698 * @c: UBIFS file-system description object
706 * 699 *
@@ -710,17 +703,16 @@ static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
710static int replay_buds(struct ubifs_info *c) 703static int replay_buds(struct ubifs_info *c)
711{ 704{
712 struct bud_entry *b; 705 struct bud_entry *b;
713 int err, uninitialized_var(free), uninitialized_var(dirty); 706 int err;
707 unsigned long long prev_sqnum = 0;
714 708
715 list_for_each_entry(b, &c->replay_buds, list) { 709 list_for_each_entry(b, &c->replay_buds, list) {
716 err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead, 710 err = replay_bud(c, b);
717 &free, &dirty);
718 if (err)
719 return err;
720 err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum,
721 free, dirty, b->bud->jhead);
722 if (err) 711 if (err)
723 return err; 712 return err;
713
714 ubifs_assert(b->sqnum > prev_sqnum);
715 prev_sqnum = b->sqnum;
724 } 716 }
725 717
726 return 0; 718 return 0;
@@ -1060,25 +1052,29 @@ int ubifs_replay_journal(struct ubifs_info *c)
1060 if (err) 1052 if (err)
1061 goto out; 1053 goto out;
1062 1054
1063 err = apply_replay_tree(c); 1055 err = apply_replay_list(c);
1056 if (err)
1057 goto out;
1058
1059 err = set_buds_lprops(c);
1064 if (err) 1060 if (err)
1065 goto out; 1061 goto out;
1066 1062
1067 /* 1063 /*
1068 * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable 1064 * UBIFS budgeting calculations use @c->bi.uncommitted_idx variable
1069 * to roughly estimate index growth. Things like @c->min_idx_lebs 1065 * to roughly estimate index growth. Things like @c->bi.min_idx_lebs
1070 * depend on it. This means we have to initialize it to make sure 1066 * depend on it. This means we have to initialize it to make sure
1071 * budgeting works properly. 1067 * budgeting works properly.
1072 */ 1068 */
1073 c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt); 1069 c->bi.uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
1074 c->budg_uncommitted_idx *= c->max_idx_node_sz; 1070 c->bi.uncommitted_idx *= c->max_idx_node_sz;
1075 1071
1076 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); 1072 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
1077 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " 1073 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
1078 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, 1074 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
1079 (unsigned long)c->highest_inum); 1075 (unsigned long)c->highest_inum);
1080out: 1076out:
1081 destroy_replay_tree(c); 1077 destroy_replay_list(c);
1082 destroy_bud_list(c); 1078 destroy_bud_list(c);
1083 c->replaying = 0; 1079 c->replaying = 0;
1084 return err; 1080 return err;
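The @c->bi.uncommitted_idx initialization above is deliberately a coarse upper bound: every dirty znode is assumed to eventually cost one maximum-size index node on flash. A worked example with made-up numbers:

#include <stdio.h>

int main(void)
{
	long long dirty_zn_cnt = 1000;	/* dirty in-memory index nodes after replay */
	int max_idx_node_sz = 512;	/* hypothetical worst-case on-flash index node size */

	/* ~500 KiB of index growth is budgeted until the next commit */
	printf("uncommitted index budget: %lld bytes\n",
	       dirty_zn_cnt * max_idx_node_sz);
	return 0;
}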
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index bf31b4729e51..c606f010e8df 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -475,7 +475,8 @@ failed:
475 * @c: UBIFS file-system description object 475 * @c: UBIFS file-system description object
476 * 476 *
477 * This function returns a pointer to the superblock node or a negative error 477 * This function returns a pointer to the superblock node or a negative error
478 * code. 478 * code. Note, the user of this function is responsible for kfree()'ing
479 * the returned superblock buffer.
479 */ 480 */
480struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) 481struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
481{ 482{
@@ -616,6 +617,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
616 c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); 617 c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
617 memcpy(&c->uuid, &sup->uuid, 16); 618 memcpy(&c->uuid, &sup->uuid, 16);
618 c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); 619 c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
620 c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP);
619 621
620 /* Automatically increase file system size to the maximum size */ 622 /* Automatically increase file system size to the maximum size */
621 c->old_leb_cnt = c->leb_cnt; 623 c->old_leb_cnt = c->leb_cnt;
@@ -650,3 +652,152 @@ out:
650 kfree(sup); 652 kfree(sup);
651 return err; 653 return err;
652} 654}
655
656/**
657 * fixup_leb - fixup/unmap an LEB containing free space.
658 * @c: UBIFS file-system description object
659 * @lnum: the LEB number to fix up
660 * @len: number of used bytes in LEB (starting at offset 0)
661 *
662 * This function reads the contents of the given LEB number @lnum, then fixes
663 * it up, so that empty min. I/O units at the end of the LEB are actually erased on
664 * flash (rather than merely containing all-0xff data). If the LEB is completely
665 * empty, it is simply unmapped.
666 */
667static int fixup_leb(struct ubifs_info *c, int lnum, int len)
668{
669 int err;
670
671 ubifs_assert(len >= 0);
672 ubifs_assert(len % c->min_io_size == 0);
673 ubifs_assert(len < c->leb_size);
674
675 if (len == 0) {
676 dbg_mnt("unmap empty LEB %d", lnum);
677 return ubi_leb_unmap(c->ubi, lnum);
678 }
679
680 dbg_mnt("fixup LEB %d, data len %d", lnum, len);
681 err = ubi_read(c->ubi, lnum, c->sbuf, 0, len);
682 if (err)
683 return err;
684
685 return ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
686}
687
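The effect of fixup_leb() is easiest to see on a simulated eraseblock: preserve the used prefix, then rewrite the whole LEB so the tail is genuinely erased flash rather than programmed 0xff bytes. A self-contained sketch with an array-backed "flash" and hypothetical sizes:

#include <string.h>

#define LEB_SIZE 4096

static unsigned char leb[LEB_SIZE];
static int leb_mapped = 1;

/* Keep the first 'len' used bytes (len <= LEB_SIZE), erase the rest. */
static void fixup_leb_sim(int len)
{
	unsigned char buf[LEB_SIZE];

	if (len == 0) {			/* completely empty: just unmap it */
		leb_mapped = 0;
		return;
	}
	memcpy(buf, leb, len);		/* ubi_read() of the used prefix */
	memset(leb, 0xff, LEB_SIZE);	/* the erase cycle */
	memcpy(leb, buf, len);		/* ubi_leb_change(): write back only used data */
}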
688/**
689 * fixup_free_space - find & remap all LEBs containing free space.
690 * @c: UBIFS file-system description object
691 *
692 * This function walks through all LEBs in the filesystem and fixes up those
693 * containing free/empty space.
694 */
695static int fixup_free_space(struct ubifs_info *c)
696{
697 int lnum, err = 0;
698 struct ubifs_lprops *lprops;
699
700 ubifs_get_lprops(c);
701
702 /* Fixup LEBs in the master area */
703 for (lnum = UBIFS_MST_LNUM; lnum < UBIFS_LOG_LNUM; lnum++) {
704 err = fixup_leb(c, lnum, c->mst_offs + c->mst_node_alsz);
705 if (err)
706 goto out;
707 }
708
709 /* Unmap unused log LEBs */
710 lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
711 while (lnum != c->ltail_lnum) {
712 err = fixup_leb(c, lnum, 0);
713 if (err)
714 goto out;
715 lnum = ubifs_next_log_lnum(c, lnum);
716 }
717
718 /* Fixup the current log head */
719 err = fixup_leb(c, c->lhead_lnum, c->lhead_offs);
720 if (err)
721 goto out;
722
723 /* Fixup LEBs in the LPT area */
724 for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) {
725 int free = c->ltab[lnum - c->lpt_first].free;
726
727 if (free > 0) {
728 err = fixup_leb(c, lnum, c->leb_size - free);
729 if (err)
730 goto out;
731 }
732 }
733
734 /* Unmap LEBs in the orphans area */
735 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
736 err = fixup_leb(c, lnum, 0);
737 if (err)
738 goto out;
739 }
740
741 /* Fixup LEBs in the main area */
742 for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
743 lprops = ubifs_lpt_lookup(c, lnum);
744 if (IS_ERR(lprops)) {
745 err = PTR_ERR(lprops);
746 goto out;
747 }
748
749 if (lprops->free > 0) {
750 err = fixup_leb(c, lnum, c->leb_size - lprops->free);
751 if (err)
752 goto out;
753 }
754 }
755
756out:
757 ubifs_release_lprops(c);
758 return err;
759}
760
761/**
762 * ubifs_fixup_free_space - find & fix all LEBs with free space.
763 * @c: UBIFS file-system description object
764 *
765 * This function fixes up LEBs containing free space on first mount, if the
766 * appropriate flag was set when the FS was created. Each LEB with one or more
767 * empty min. I/O units (i.e. free-space-count > 0) is re-written, to make sure
768 * the free space is actually erased. E.g., this is necessary for some NAND
769 * chips, since the free space may have been programmed as real "0xff" data
770 * (generating a non-0xff ECC), causing future writes to the not-really-erased
771 * NAND pages to behave badly. After the space is fixed up, the superblock flag
772 * is cleared, so that this is skipped for all future mounts.
773 */
774int ubifs_fixup_free_space(struct ubifs_info *c)
775{
776 int err;
777 struct ubifs_sb_node *sup;
778
779 ubifs_assert(c->space_fixup);
780 ubifs_assert(!c->ro_mount);
781
782 ubifs_msg("start fixing up free space");
783
784 err = fixup_free_space(c);
785 if (err)
786 return err;
787
788 sup = ubifs_read_sb_node(c);
789 if (IS_ERR(sup))
790 return PTR_ERR(sup);
791
792 /* Free-space fixup is no longer required */
793 c->space_fixup = 0;
794 sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP);
795
796 err = ubifs_write_sb_node(c, sup);
797 kfree(sup);
798 if (err)
799 return err;
800
801 ubifs_msg("free space fixup complete");
802 return err;
803}
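Note that the flag is cleared in the on-media byte order: the mask itself is converted with cpu_to_le32() so the &= operates directly on the little-endian field. A userspace equivalent, assuming glibc's <endian.h>:

#include <endian.h>
#include <stdint.h>

#define UBIFS_FLG_SPACE_FIXUP 0x04

/* Superblock flags live little-endian on flash; mask them in that form. */
static void clear_space_fixup(uint32_t *le_flags)
{
	*le_flags &= htole32(~(uint32_t)UBIFS_FLG_SPACE_FIXUP);
}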
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 04ad07f4fcc3..6db0bdaa9f74 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -375,7 +375,7 @@ out:
375 ubifs_release_dirty_inode_budget(c, ui); 375 ubifs_release_dirty_inode_budget(c, ui);
376 else { 376 else {
377 /* We've deleted something - clean the "no space" flags */ 377 /* We've deleted something - clean the "no space" flags */
378 c->nospace = c->nospace_rp = 0; 378 c->bi.nospace = c->bi.nospace_rp = 0;
379 smp_wmb(); 379 smp_wmb();
380 } 380 }
381done: 381done:
@@ -694,11 +694,11 @@ static int init_constants_sb(struct ubifs_info *c)
694 * be compressed and direntries are of the maximum size. 694 * be compressed and direntries are of the maximum size.
695 * 695 *
696 * Note, data, which may be stored in inodes is budgeted separately, so 696 * Note, data, which may be stored in inodes is budgeted separately, so
697 * it is not included into 'c->inode_budget'. 697 * it is not included into 'c->bi.inode_budget'.
698 */ 698 */
699 c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; 699 c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
700 c->inode_budget = UBIFS_INO_NODE_SZ; 700 c->bi.inode_budget = UBIFS_INO_NODE_SZ;
701 c->dent_budget = UBIFS_MAX_DENT_NODE_SZ; 701 c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ;
702 702
703 /* 703 /*
704 * When the amount of flash space used by buds becomes 704 * When the amount of flash space used by buds becomes
@@ -742,7 +742,7 @@ static void init_constants_master(struct ubifs_info *c)
742{ 742{
743 long long tmp64; 743 long long tmp64;
744 744
745 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 745 c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
746 c->report_rp_size = ubifs_reported_space(c, c->rp_size); 746 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
747 747
748 /* 748 /*
@@ -1144,8 +1144,8 @@ static int check_free_space(struct ubifs_info *c)
1144{ 1144{
1145 ubifs_assert(c->dark_wm > 0); 1145 ubifs_assert(c->dark_wm > 0);
1146 if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { 1146 if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
1147 ubifs_err("insufficient free space to mount in read/write mode"); 1147 ubifs_err("insufficient free space to mount in R/W mode");
1148 dbg_dump_budg(c); 1148 dbg_dump_budg(c, &c->bi);
1149 dbg_dump_lprops(c); 1149 dbg_dump_lprops(c);
1150 return -ENOSPC; 1150 return -ENOSPC;
1151 } 1151 }
@@ -1304,7 +1304,7 @@ static int mount_ubifs(struct ubifs_info *c)
1304 if (err) 1304 if (err)
1305 goto out_lpt; 1305 goto out_lpt;
1306 1306
1307 err = dbg_check_idx_size(c, c->old_idx_sz); 1307 err = dbg_check_idx_size(c, c->bi.old_idx_sz);
1308 if (err) 1308 if (err)
1309 goto out_lpt; 1309 goto out_lpt;
1310 1310
@@ -1313,7 +1313,7 @@ static int mount_ubifs(struct ubifs_info *c)
1313 goto out_journal; 1313 goto out_journal;
1314 1314
1315 /* Calculate 'min_idx_lebs' after journal replay */ 1315 /* Calculate 'min_idx_lebs' after journal replay */
1316 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 1316 c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
1317 1317
1318 err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount); 1318 err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount);
1319 if (err) 1319 if (err)
@@ -1396,6 +1396,12 @@ static int mount_ubifs(struct ubifs_info *c)
1396 } else 1396 } else
1397 ubifs_assert(c->lst.taken_empty_lebs > 0); 1397 ubifs_assert(c->lst.taken_empty_lebs > 0);
1398 1398
1399 if (!c->ro_mount && c->space_fixup) {
1400 err = ubifs_fixup_free_space(c);
1401 if (err)
1402 goto out_infos;
1403 }
1404
1399 err = dbg_check_filesystem(c); 1405 err = dbg_check_filesystem(c);
1400 if (err) 1406 if (err)
1401 goto out_infos; 1407 goto out_infos;
@@ -1442,7 +1448,8 @@ static int mount_ubifs(struct ubifs_info *c)
1442 c->main_lebs, c->main_first, c->leb_cnt - 1); 1448 c->main_lebs, c->main_first, c->leb_cnt - 1);
1443 dbg_msg("index LEBs: %d", c->lst.idx_lebs); 1449 dbg_msg("index LEBs: %d", c->lst.idx_lebs);
1444 dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)", 1450 dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)",
1445 c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20); 1451 c->bi.old_idx_sz, c->bi.old_idx_sz >> 10,
1452 c->bi.old_idx_sz >> 20);
1446 dbg_msg("key hash type: %d", c->key_hash_type); 1453 dbg_msg("key hash type: %d", c->key_hash_type);
1447 dbg_msg("tree fanout: %d", c->fanout); 1454 dbg_msg("tree fanout: %d", c->fanout);
1448 dbg_msg("reserved GC LEB: %d", c->gc_lnum); 1455 dbg_msg("reserved GC LEB: %d", c->gc_lnum);
@@ -1456,7 +1463,7 @@ static int mount_ubifs(struct ubifs_info *c)
1456 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", 1463 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu",
1457 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); 1464 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
1458 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", 1465 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d",
1459 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, 1466 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
1460 UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); 1467 UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout));
1461 dbg_msg("dead watermark: %d", c->dead_wm); 1468 dbg_msg("dead watermark: %d", c->dead_wm);
1462 dbg_msg("dark watermark: %d", c->dark_wm); 1469 dbg_msg("dark watermark: %d", c->dark_wm);
@@ -1584,6 +1591,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1584 } 1591 }
1585 sup->leb_cnt = cpu_to_le32(c->leb_cnt); 1592 sup->leb_cnt = cpu_to_le32(c->leb_cnt);
1586 err = ubifs_write_sb_node(c, sup); 1593 err = ubifs_write_sb_node(c, sup);
1594 kfree(sup);
1587 if (err) 1595 if (err)
1588 goto out; 1596 goto out;
1589 } 1597 }
@@ -1684,6 +1692,13 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1684 */ 1692 */
1685 err = dbg_check_space_info(c); 1693 err = dbg_check_space_info(c);
1686 } 1694 }
1695
1696 if (c->space_fixup) {
1697 err = ubifs_fixup_free_space(c);
1698 if (err)
1699 goto out;
1700 }
1701
1687 mutex_unlock(&c->umount_mutex); 1702 mutex_unlock(&c->umount_mutex);
1688 return err; 1703 return err;
1689 1704
@@ -1766,10 +1781,9 @@ static void ubifs_put_super(struct super_block *sb)
1766 * to write them back because of I/O errors. 1781 * to write them back because of I/O errors.
1767 */ 1782 */
1768 if (!c->ro_error) { 1783 if (!c->ro_error) {
1769 ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); 1784 ubifs_assert(c->bi.idx_growth == 0);
1770 ubifs_assert(c->budg_idx_growth == 0); 1785 ubifs_assert(c->bi.dd_growth == 0);
1771 ubifs_assert(c->budg_dd_growth == 0); 1786 ubifs_assert(c->bi.data_growth == 0);
1772 ubifs_assert(c->budg_data_growth == 0);
1773 } 1787 }
1774 1788
1775 /* 1789 /*
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index de485979ca39..8119b1fd8d94 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2557,11 +2557,11 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
2557 if (err) { 2557 if (err) {
2558 /* Ensure the znode is dirtied */ 2558 /* Ensure the znode is dirtied */
2559 if (znode->cnext || !ubifs_zn_dirty(znode)) { 2559 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2560 znode = dirty_cow_bottom_up(c, znode); 2560 znode = dirty_cow_bottom_up(c, znode);
2561 if (IS_ERR(znode)) { 2561 if (IS_ERR(znode)) {
2562 err = PTR_ERR(znode); 2562 err = PTR_ERR(znode);
2563 goto out_unlock; 2563 goto out_unlock;
2564 } 2564 }
2565 } 2565 }
2566 err = tnc_delete(c, znode, n); 2566 err = tnc_delete(c, znode, n);
2567 } 2567 }
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 53288e5d604e..41920f357bbf 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -377,15 +377,13 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
377 c->gap_lebs = NULL; 377 c->gap_lebs = NULL;
378 return err; 378 return err;
379 } 379 }
380 if (!dbg_force_in_the_gaps_enabled) { 380 if (!dbg_force_in_the_gaps_enabled()) {
381 /* 381 /*
382 * Do not print scary warnings if the debugging 382 * Do not print scary warnings if the debugging
383 * option which forces in-the-gaps is enabled. 383 * option which forces in-the-gaps is enabled.
384 */ 384 */
385 ubifs_err("out of space"); 385 ubifs_warn("out of space");
386 spin_lock(&c->space_lock); 386 dbg_dump_budg(c, &c->bi);
387 dbg_dump_budg(c);
388 spin_unlock(&c->space_lock);
389 dbg_dump_lprops(c); 387 dbg_dump_lprops(c);
390 } 388 }
391 /* Try to commit anyway */ 389 /* Try to commit anyway */
@@ -796,16 +794,16 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
796 spin_lock(&c->space_lock); 794 spin_lock(&c->space_lock);
797 /* 795 /*
798 * Although we have not finished committing yet, update size of the 796 * Although we have not finished committing yet, update size of the
799 * committed index ('c->old_idx_sz') and zero out the index growth 797 * committed index ('c->bi.old_idx_sz') and zero out the index growth
800 * budget. It is OK to do this now, because we've reserved all the 798 * budget. It is OK to do this now, because we've reserved all the
801 * space which is needed to commit the index, and it is safe for the 799 * space which is needed to commit the index, and it is safe for the
802 * budgeting subsystem to assume the index is already committed, 800 * budgeting subsystem to assume the index is already committed,
803 * even though it is not. 801 * even though it is not.
804 */ 802 */
805 ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c)); 803 ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c));
806 c->old_idx_sz = c->calc_idx_sz; 804 c->bi.old_idx_sz = c->calc_idx_sz;
807 c->budg_uncommitted_idx = 0; 805 c->bi.uncommitted_idx = 0;
808 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 806 c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
809 spin_unlock(&c->space_lock); 807 spin_unlock(&c->space_lock);
810 mutex_unlock(&c->tnc_mutex); 808 mutex_unlock(&c->tnc_mutex);
811 809
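This hunk is the "publish" side of the budgeting contract: under @c->space_lock the new index size becomes the committed size and the uncommitted-growth budget is zeroed in a single critical section, so readers never observe a half-updated pair. A pthread sketch of the same discipline (names hypothetical, a mutex standing in for the spinlock):

#include <pthread.h>

struct budg_info {
	pthread_mutex_t lock;
	unsigned long long old_idx_sz;	/* size of the committed index */
	long long uncommitted_idx;	/* growth budgeted but not yet committed */
};

/* Commit start: publish the freshly calculated index size atomically. */
static void publish_idx_size(struct budg_info *bi, unsigned long long calc_idx_sz)
{
	pthread_mutex_lock(&bi->lock);
	bi->old_idx_sz = calc_idx_sz;
	bi->uncommitted_idx = 0;
	pthread_mutex_unlock(&bi->lock);
}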
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 191ca7863fe7..e24380cf46ed 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -408,9 +408,11 @@ enum {
408 * Superblock flags. 408 * Superblock flags.
409 * 409 *
410 * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set 410 * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set
411 * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed
411 */ 412 */
412enum { 413enum {
413 UBIFS_FLG_BIGLPT = 0x02, 414 UBIFS_FLG_BIGLPT = 0x02,
415 UBIFS_FLG_SPACE_FIXUP = 0x04,
414}; 416};
415 417
416/** 418/**
@@ -434,7 +436,7 @@ struct ubifs_ch {
434 __u8 node_type; 436 __u8 node_type;
435 __u8 group_type; 437 __u8 group_type;
436 __u8 padding[2]; 438 __u8 padding[2];
437} __attribute__ ((packed)); 439} __packed;
438 440
439/** 441/**
440 * union ubifs_dev_desc - device node descriptor. 442 * union ubifs_dev_desc - device node descriptor.
@@ -448,7 +450,7 @@ struct ubifs_ch {
448union ubifs_dev_desc { 450union ubifs_dev_desc {
449 __le32 new; 451 __le32 new;
450 __le64 huge; 452 __le64 huge;
451} __attribute__ ((packed)); 453} __packed;
452 454
453/** 455/**
454 * struct ubifs_ino_node - inode node. 456 * struct ubifs_ino_node - inode node.
@@ -509,7 +511,7 @@ struct ubifs_ino_node {
509 __le16 compr_type; 511 __le16 compr_type;
510 __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */ 512 __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */
511 __u8 data[]; 513 __u8 data[];
512} __attribute__ ((packed)); 514} __packed;
513 515
514/** 516/**
515 * struct ubifs_dent_node - directory entry node. 517 * struct ubifs_dent_node - directory entry node.
@@ -534,7 +536,7 @@ struct ubifs_dent_node {
534 __le16 nlen; 536 __le16 nlen;
535 __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */ 537 __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */
536 __u8 name[]; 538 __u8 name[];
537} __attribute__ ((packed)); 539} __packed;
538 540
539/** 541/**
540 * struct ubifs_data_node - data node. 542 * struct ubifs_data_node - data node.
@@ -555,7 +557,7 @@ struct ubifs_data_node {
555 __le16 compr_type; 557 __le16 compr_type;
556 __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ 558 __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */
557 __u8 data[]; 559 __u8 data[];
558} __attribute__ ((packed)); 560} __packed;
559 561
560/** 562/**
561 * struct ubifs_trun_node - truncation node. 563 * struct ubifs_trun_node - truncation node.
@@ -575,7 +577,7 @@ struct ubifs_trun_node {
575 __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */ 577 __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */
576 __le64 old_size; 578 __le64 old_size;
577 __le64 new_size; 579 __le64 new_size;
578} __attribute__ ((packed)); 580} __packed;
579 581
580/** 582/**
581 * struct ubifs_pad_node - padding node. 583 * struct ubifs_pad_node - padding node.
@@ -586,7 +588,7 @@ struct ubifs_trun_node {
586struct ubifs_pad_node { 588struct ubifs_pad_node {
587 struct ubifs_ch ch; 589 struct ubifs_ch ch;
588 __le32 pad_len; 590 __le32 pad_len;
589} __attribute__ ((packed)); 591} __packed;
590 592
591/** 593/**
592 * struct ubifs_sb_node - superblock node. 594 * struct ubifs_sb_node - superblock node.
@@ -644,7 +646,7 @@ struct ubifs_sb_node {
644 __u8 uuid[16]; 646 __u8 uuid[16];
645 __le32 ro_compat_version; 647 __le32 ro_compat_version;
646 __u8 padding2[3968]; 648 __u8 padding2[3968];
647} __attribute__ ((packed)); 649} __packed;
648 650
649/** 651/**
650 * struct ubifs_mst_node - master node. 652 * struct ubifs_mst_node - master node.
@@ -711,7 +713,7 @@ struct ubifs_mst_node {
711 __le32 idx_lebs; 713 __le32 idx_lebs;
712 __le32 leb_cnt; 714 __le32 leb_cnt;
713 __u8 padding[344]; 715 __u8 padding[344];
714} __attribute__ ((packed)); 716} __packed;
715 717
716/** 718/**
717 * struct ubifs_ref_node - logical eraseblock reference node. 719 * struct ubifs_ref_node - logical eraseblock reference node.
@@ -727,7 +729,7 @@ struct ubifs_ref_node {
727 __le32 offs; 729 __le32 offs;
728 __le32 jhead; 730 __le32 jhead;
729 __u8 padding[28]; 731 __u8 padding[28];
730} __attribute__ ((packed)); 732} __packed;
731 733
732/** 734/**
733 * struct ubifs_branch - key/reference/length branch 735 * struct ubifs_branch - key/reference/length branch
@@ -741,7 +743,7 @@ struct ubifs_branch {
741 __le32 offs; 743 __le32 offs;
742 __le32 len; 744 __le32 len;
743 __u8 key[]; 745 __u8 key[];
744} __attribute__ ((packed)); 746} __packed;
745 747
746/** 748/**
747 * struct ubifs_idx_node - indexing node. 749 * struct ubifs_idx_node - indexing node.
@@ -755,7 +757,7 @@ struct ubifs_idx_node {
755 __le16 child_cnt; 757 __le16 child_cnt;
756 __le16 level; 758 __le16 level;
757 __u8 branches[]; 759 __u8 branches[];
758} __attribute__ ((packed)); 760} __packed;
759 761
760/** 762/**
761 * struct ubifs_cs_node - commit start node. 763 * struct ubifs_cs_node - commit start node.
@@ -765,7 +767,7 @@ struct ubifs_idx_node {
765struct ubifs_cs_node { 767struct ubifs_cs_node {
766 struct ubifs_ch ch; 768 struct ubifs_ch ch;
767 __le64 cmt_no; 769 __le64 cmt_no;
768} __attribute__ ((packed)); 770} __packed;
769 771
770/** 772/**
771 * struct ubifs_orph_node - orphan node. 773 * struct ubifs_orph_node - orphan node.
@@ -777,6 +779,6 @@ struct ubifs_orph_node {
777 struct ubifs_ch ch; 779 struct ubifs_ch ch;
778 __le64 cmt_no; 780 __le64 cmt_no;
779 __le64 inos[]; 781 __le64 inos[];
780} __attribute__ ((packed)); 782} __packed;
781 783
782#endif /* __UBIFS_MEDIA_H__ */ 784#endif /* __UBIFS_MEDIA_H__ */
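Every structure in this header describes on-flash layout, so compiler-inserted padding would corrupt the media format; __packed is simply the kernel's shorthand for __attribute__((packed)). The difference is easy to demonstrate in isolation (GCC/clang assumed; the exact unpacked size is ABI-dependent):

#include <stdint.h>
#include <stdio.h>

#define __packed __attribute__((packed))	/* same expansion the kernel uses */

struct on_media { uint8_t type; uint64_t sqnum; } __packed;	/* 9 bytes */
struct in_memory { uint8_t type; uint64_t sqnum; };		/* typically 16 bytes */

int main(void)
{
	printf("packed=%zu unpacked=%zu\n",
	       sizeof(struct on_media), sizeof(struct in_memory));
	return 0;
}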
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 8c40ad3c6721..93d1412a06f0 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -389,9 +389,9 @@ struct ubifs_gced_idx_leb {
389 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses 389 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
390 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot 390 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
391 * make sure @inode->i_size is always changed under @ui_mutex, because it 391 * make sure @inode->i_size is always changed under @ui_mutex, because it
392 * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would deadlock 392 * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would
393 * with 'ubifs_writepage()' (see file.c). All the other inode fields are 393 * deadlock with 'ubifs_writepage()' (see file.c). All the other inode fields
394 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one 394 * are changed under @ui_mutex, so they do not need "shadow" fields. Note, one
395 * could consider to rework locking and base it on "shadow" fields. 395 * could consider to rework locking and base it on "shadow" fields.
396 */ 396 */
397struct ubifs_inode { 397struct ubifs_inode {
@@ -937,6 +937,40 @@ struct ubifs_mount_opts {
937 unsigned int compr_type:2; 937 unsigned int compr_type:2;
938}; 938};
939 939
940/**
941 * struct ubifs_budg_info - UBIFS budgeting information.
942 * @idx_growth: amount of bytes budgeted for index growth
943 * @data_growth: amount of bytes budgeted for cached data
944 * @dd_growth: amount of bytes budgeted for cached data that will make
945 * other data dirty
946 * @uncommitted_idx: amount of bytes that were budgeted for growth of the index, but
947 * which still have to be taken into account because the index
948 * has not been committed so far
949 * @old_idx_sz: size of index on flash
950 * @min_idx_lebs: minimum number of LEBs required for the index
951 * @nospace: non-zero if the file-system does not have flash space (used as
952 * optimization)
953 * @nospace_rp: the same as @nospace, but additionally means that even the reserved
954 * pool is full
955 * @page_budget: budget for a page (constant, never changed after mount)
956 * @inode_budget: budget for an inode (constant, never changed after mount)
957 * @dent_budget: budget for a directory entry (constant, never changed after
958 * mount)
959 */
960struct ubifs_budg_info {
961 long long idx_growth;
962 long long data_growth;
963 long long dd_growth;
964 long long uncommitted_idx;
965 unsigned long long old_idx_sz;
966 int min_idx_lebs;
967 unsigned int nospace:1;
968 unsigned int nospace_rp:1;
969 int page_budget;
970 int inode_budget;
971 int dent_budget;
972};
973
940struct ubifs_debug_info; 974struct ubifs_debug_info;
941 975
942/** 976/**
@@ -980,6 +1014,7 @@ struct ubifs_debug_info;
980 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running 1014 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
981 * 1015 *
982 * @big_lpt: flag that LPT is too big to write whole during commit 1016 * @big_lpt: flag that LPT is too big to write whole during commit
1017 * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up
983 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during 1018 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
984 * recovery) 1019 * recovery)
985 * @bulk_read: enable bulk-reads 1020 * @bulk_read: enable bulk-reads
@@ -1057,32 +1092,14 @@ struct ubifs_debug_info;
1057 * @dirty_zn_cnt: number of dirty znodes 1092 * @dirty_zn_cnt: number of dirty znodes
1058 * @clean_zn_cnt: number of clean znodes 1093 * @clean_zn_cnt: number of clean znodes
1059 * 1094 *
1060 * @budg_idx_growth: amount of bytes budgeted for index growth 1095 * @space_lock: protects @bi and @lst
1061 * @budg_data_growth: amount of bytes budgeted for cached data 1096 * @lst: lprops statistics
1062 * @budg_dd_growth: amount of bytes budgeted for cached data that will make 1097 * @bi: budgeting information
1063 * other data dirty
1064 * @budg_uncommitted_idx: amount of bytes were budgeted for growth of the index,
1065 * but which still have to be taken into account because
1066 * the index has not been committed so far
1067 * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth,
1068 * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, @lst,
1069 * @nospace, and @nospace_rp;
1070 * @min_idx_lebs: minimum number of LEBs required for the index
1071 * @old_idx_sz: size of index on flash
1072 * @calc_idx_sz: temporary variable which is used to calculate new index size 1098 * @calc_idx_sz: temporary variable which is used to calculate new index size
1073 * (contains accurate new index size at end of TNC commit start) 1099 * (contains accurate new index size at end of TNC commit start)
1074 * @lst: lprops statistics
1075 * @nospace: non-zero if the file-system does not have flash space (used as
1076 * optimization)
1077 * @nospace_rp: the same as @nospace, but additionally means that even reserved
1078 * pool is full
1079 *
1080 * @page_budget: budget for a page
1081 * @inode_budget: budget for an inode
1082 * @dent_budget: budget for a directory entry
1083 * 1100 *
1084 * @ref_node_alsz: size of the LEB reference node aligned to the min. flash 1101 * @ref_node_alsz: size of the LEB reference node aligned to the min. flash
1085 * I/O unit 1102 * I/O unit
1086 * @mst_node_alsz: master node aligned size 1103 * @mst_node_alsz: master node aligned size
1087 * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary 1104 * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary
1088 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary 1105 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
@@ -1189,7 +1206,6 @@ struct ubifs_debug_info;
1189 * @replaying: %1 during journal replay 1206 * @replaying: %1 during journal replay
1190 * @mounting: %1 while mounting 1207 * @mounting: %1 while mounting
1191 * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode 1208 * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
1192 * @replay_tree: temporary tree used during journal replay
1193 * @replay_list: temporary list used during journal replay 1209 * @replay_list: temporary list used during journal replay
1194 * @replay_buds: list of buds to replay 1210 * @replay_buds: list of buds to replay
1195 * @cs_sqnum: sequence number of first node in the log (commit start node) 1211 * @cs_sqnum: sequence number of first node in the log (commit start node)
@@ -1238,6 +1254,7 @@ struct ubifs_info {
1238 wait_queue_head_t cmt_wq; 1254 wait_queue_head_t cmt_wq;
1239 1255
1240 unsigned int big_lpt:1; 1256 unsigned int big_lpt:1;
1257 unsigned int space_fixup:1;
1241 unsigned int no_chk_data_crc:1; 1258 unsigned int no_chk_data_crc:1;
1242 unsigned int bulk_read:1; 1259 unsigned int bulk_read:1;
1243 unsigned int default_compr:2; 1260 unsigned int default_compr:2;
@@ -1308,21 +1325,10 @@ struct ubifs_info {
1308 atomic_long_t dirty_zn_cnt; 1325 atomic_long_t dirty_zn_cnt;
1309 atomic_long_t clean_zn_cnt; 1326 atomic_long_t clean_zn_cnt;
1310 1327
1311 long long budg_idx_growth;
1312 long long budg_data_growth;
1313 long long budg_dd_growth;
1314 long long budg_uncommitted_idx;
1315 spinlock_t space_lock; 1328 spinlock_t space_lock;
1316 int min_idx_lebs;
1317 unsigned long long old_idx_sz;
1318 unsigned long long calc_idx_sz;
1319 struct ubifs_lp_stats lst; 1329 struct ubifs_lp_stats lst;
1320 unsigned int nospace:1; 1330 struct ubifs_budg_info bi;
1321 unsigned int nospace_rp:1; 1331 unsigned long long calc_idx_sz;
1322
1323 int page_budget;
1324 int inode_budget;
1325 int dent_budget;
1326 1332
1327 int ref_node_alsz; 1333 int ref_node_alsz;
1328 int mst_node_alsz; 1334 int mst_node_alsz;
@@ -1430,7 +1436,6 @@ struct ubifs_info {
1430 unsigned int replaying:1; 1436 unsigned int replaying:1;
1431 unsigned int mounting:1; 1437 unsigned int mounting:1;
1432 unsigned int remounting_rw:1; 1438 unsigned int remounting_rw:1;
1433 struct rb_root replay_tree;
1434 struct list_head replay_list; 1439 struct list_head replay_list;
1435 struct list_head replay_buds; 1440 struct list_head replay_buds;
1436 unsigned long long cs_sqnum; 1441 unsigned long long cs_sqnum;
@@ -1628,6 +1633,7 @@ int ubifs_write_master(struct ubifs_info *c);
1628int ubifs_read_superblock(struct ubifs_info *c); 1633int ubifs_read_superblock(struct ubifs_info *c);
1629struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); 1634struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c);
1630int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); 1635int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup);
1636int ubifs_fixup_free_space(struct ubifs_info *c);
1631 1637
1632/* replay.c */ 1638/* replay.c */
1633int ubifs_validate_entry(struct ubifs_info *c, 1639int ubifs_validate_entry(struct ubifs_info *c,
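Folding the budgeting fields into one struct also simplifies the lock documentation (@space_lock now just "protects @bi and @lst") and lets debug code copy a consistent picture in one go. A hedged sketch of that read side, with hypothetical names and a mutex standing in for the spinlock:

#include <pthread.h>
#include <string.h>

struct budg_info {
	long long idx_growth, data_growth, dd_growth;
};

struct fs_info {
	pthread_mutex_t space_lock;
	struct budg_info bi;
};

/* Snapshot every budgeting counter in one critical section, dump at leisure. */
static void snapshot_budg(struct fs_info *c, struct budg_info *out)
{
	pthread_mutex_lock(&c->space_lock);
	memcpy(out, &c->bi, sizeof(*out));
	pthread_mutex_unlock(&c->space_lock);
}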
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 3299f469e712..16f19f55e63f 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -80,8 +80,8 @@ enum {
80 SECURITY_XATTR, 80 SECURITY_XATTR,
81}; 81};
82 82
83static const struct inode_operations none_inode_operations; 83static const struct inode_operations empty_iops;
84static const struct file_operations none_file_operations; 84static const struct file_operations empty_fops;
85 85
86/** 86/**
87 * create_xattr - create an extended attribute. 87 * create_xattr - create an extended attribute.
@@ -131,8 +131,8 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
131 131
132 /* Re-define all operations to be "nothing" */ 132 /* Re-define all operations to be "nothing" */
133 inode->i_mapping->a_ops = &empty_aops; 133 inode->i_mapping->a_ops = &empty_aops;
134 inode->i_op = &none_inode_operations; 134 inode->i_op = &empty_iops;
135 inode->i_fop = &none_file_operations; 135 inode->i_fop = &empty_fops;
136 136
137 inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA; 137 inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA;
138 ui = ubifs_inode(inode); 138 ui = ubifs_inode(inode);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f1dce848ef96..4d76594c2a8f 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -783,6 +783,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
783 struct fileIdentDesc *fi, cfi; 783 struct fileIdentDesc *fi, cfi;
784 struct kernel_lb_addr tloc; 784 struct kernel_lb_addr tloc;
785 785
786 dentry_unhash(dentry);
787
786 retval = -ENOENT; 788 retval = -ENOENT;
787 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 789 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
788 if (!fi) 790 if (!fi)
@@ -1081,6 +1083,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1081 struct kernel_lb_addr tloc; 1083 struct kernel_lb_addr tloc;
1082 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1084 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1083 1085
1086 if (new_inode && S_ISDIR(new_inode->i_mode))
1087 dentry_unhash(new_dentry);
1088
1084 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1089 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1085 if (ofi) { 1090 if (ofi) {
1086 if (ofibh.sbh != ofibh.ebh) 1091 if (ofibh.sbh != ofibh.ebh)
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 46f7a807bbc1..42694e11c23d 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -424,8 +424,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
424 ufs_cpu_to_data_ptr(sb, p, result); 424 ufs_cpu_to_data_ptr(sb, p, result);
425 *err = 0; 425 *err = 0;
426 UFS_I(inode)->i_lastfrag = 426 UFS_I(inode)->i_lastfrag =
427 max_t(u32, UFS_I(inode)->i_lastfrag, 427 max(UFS_I(inode)->i_lastfrag, fragment + count);
428 fragment + count);
429 ufs_clear_frags(inode, result + oldcount, 428 ufs_clear_frags(inode, result + oldcount,
430 newcount - oldcount, locked_page != NULL); 429 newcount - oldcount, locked_page != NULL);
431 } 430 }
@@ -440,7 +439,8 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
440 result = ufs_add_fragments (inode, tmp, oldcount, newcount, err); 439 result = ufs_add_fragments (inode, tmp, oldcount, newcount, err);
441 if (result) { 440 if (result) {
442 *err = 0; 441 *err = 0;
443 UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count); 442 UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
443 fragment + count);
444 ufs_clear_frags(inode, result + oldcount, newcount - oldcount, 444 ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
445 locked_page != NULL); 445 locked_page != NULL);
446 unlock_super(sb); 446 unlock_super(sb);
@@ -479,7 +479,8 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
479 uspi->s_sbbase + result, locked_page); 479 uspi->s_sbbase + result, locked_page);
480 ufs_cpu_to_data_ptr(sb, p, result); 480 ufs_cpu_to_data_ptr(sb, p, result);
481 *err = 0; 481 *err = 0;
482 UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count); 482 UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
483 fragment + count);
483 unlock_super(sb); 484 unlock_super(sb);
484 if (newcount < request) 485 if (newcount < request)
485 ufs_free_fragments (inode, result + newcount, request - newcount); 486 ufs_free_fragments (inode, result + newcount, request - newcount);
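The max_t(u32, ...) to max() change is a bug fix, not a style cleanup: i_lastfrag is 64 bits wide here, and forcing both operands through u32 silently truncates fragment numbers beyond 2^32. A standalone demonstration of the failure mode (the macros only approximate the kernel's, which additionally type-check their operands):

#include <stdint.h>
#include <stdio.h>

#define max_t(type, a, b) ((type)(a) > (type)(b) ? (type)(a) : (type)(b))
#define max(a, b)         ((a) > (b) ? (a) : (b))

int main(void)
{
	uint64_t lastfrag = 0x100000000ULL;	/* a fragment number above 2^32 */
	uint64_t candidate = 1;

	/* truncated: (u32)0x100000000 == 0, so the "maximum" comes out as 1 */
	printf("max_t(u32): %llu\n",
	       (unsigned long long)max_t(uint32_t, lastfrag, candidate));
	/* correct: 4294967296 */
	printf("max:        %llu\n",
	       (unsigned long long)max(lastfrag, candidate));
	return 0;
}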
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index e765743cf9f3..b4d791a83207 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -409,7 +409,7 @@ out:
409} 409}
410 410
411/** 411/**
412 * ufs_getfrag_bloc() - `get_block_t' function, interface between UFS and 412 * ufs_getfrag_block() - `get_block_t' function, interface between UFS and
413 * readpage, writepage and so on 413 * readpage, writepage and so on
414 */ 414 */
415 415
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 29309e25417f..953ebdfc5bf7 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -258,6 +258,8 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
258 struct inode * inode = dentry->d_inode; 258 struct inode * inode = dentry->d_inode;
259 int err= -ENOTEMPTY; 259 int err= -ENOTEMPTY;
260 260
261 dentry_unhash(dentry);
262
261 lock_ufs(dir->i_sb); 263 lock_ufs(dir->i_sb);
262 if (ufs_empty_dir (inode)) { 264 if (ufs_empty_dir (inode)) {
263 err = ufs_unlink(dir, dentry); 265 err = ufs_unlink(dir, dentry);
@@ -282,6 +284,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
282 struct ufs_dir_entry *old_de; 284 struct ufs_dir_entry *old_de;
283 int err = -ENOENT; 285 int err = -ENOENT;
284 286
287 if (new_inode && S_ISDIR(new_inode->i_mode))
288 dentry_unhash(new_dentry);
289
285 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); 290 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
286 if (!old_de) 291 if (!old_de)
287 goto out; 292 goto out;
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 5f821dbc0579..f04f89fbd4d9 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -84,7 +84,7 @@ static int ufs_trunc_direct(struct inode *inode)
84 retry = 0; 84 retry = 0;
85 85
86 frag1 = DIRECT_FRAGMENT; 86 frag1 = DIRECT_FRAGMENT;
87 frag4 = min_t(u32, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); 87 frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag);
88 frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1); 88 frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1);
89 frag3 = frag4 & ~uspi->s_fpbmask; 89 frag3 = frag4 & ~uspi->s_fpbmask;
90 block1 = block2 = 0; 90 block1 = block2 = 0;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 9ef9ed2cfe2e..5e68099db2a5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -33,7 +33,6 @@
33#include <linux/migrate.h> 33#include <linux/migrate.h>
34#include <linux/backing-dev.h> 34#include <linux/backing-dev.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/list_sort.h>
37 36
38#include "xfs_sb.h" 37#include "xfs_sb.h"
39#include "xfs_inum.h" 38#include "xfs_inum.h"
@@ -709,6 +708,27 @@ xfs_buf_get_empty(
709 return bp; 708 return bp;
710} 709}
711 710
711/*
712 * Return a buffer allocated as an empty buffer and associated with external
713 * memory via xfs_buf_associate_memory() back to its empty state.
714 */
715void
716xfs_buf_set_empty(
717 struct xfs_buf *bp,
718 size_t len)
719{
720 if (bp->b_pages)
721 _xfs_buf_free_pages(bp);
722
723 bp->b_pages = NULL;
724 bp->b_page_count = 0;
725 bp->b_addr = NULL;
726 bp->b_file_offset = 0;
727 bp->b_buffer_length = bp->b_count_desired = len;
728 bp->b_bn = XFS_BUF_DADDR_NULL;
729 bp->b_flags &= ~XBF_MAPPED;
730}
731
712static inline struct page * 732static inline struct page *
713mem_to_page( 733mem_to_page(
714 void *addr) 734 void *addr)
@@ -1402,12 +1422,12 @@ restart:
1402int 1422int
1403xfs_buftarg_shrink( 1423xfs_buftarg_shrink(
1404 struct shrinker *shrink, 1424 struct shrinker *shrink,
1405 int nr_to_scan, 1425 struct shrink_control *sc)
1406 gfp_t mask)
1407{ 1426{
1408 struct xfs_buftarg *btp = container_of(shrink, 1427 struct xfs_buftarg *btp = container_of(shrink,
1409 struct xfs_buftarg, bt_shrinker); 1428 struct xfs_buftarg, bt_shrinker);
1410 struct xfs_buf *bp; 1429 struct xfs_buf *bp;
1430 int nr_to_scan = sc->nr_to_scan;
1411 LIST_HEAD(dispose); 1431 LIST_HEAD(dispose);
1412 1432
1413 if (!nr_to_scan) 1433 if (!nr_to_scan)
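This is part of the tree-wide shrinker API change in this kernel cycle (the same conversion appears in xfs_sync.c and xfs_qm.c below): callbacks receive one struct shrink_control instead of separate nr_to_scan/gfp_mask arguments, so future parameters no longer require touching every shrinker. A compilable sketch of the two shapes, with the struct layout approximated from this era and gfp_t reduced to a plain integer:

typedef unsigned int gfp_t;

struct shrink_control {			/* approximation of the kernel struct */
	gfp_t gfp_mask;
	unsigned long nr_to_scan;
};

struct shrinker;			/* opaque for this sketch */

/* Old style: every new parameter was an API break across the whole tree. */
static int shrink_old(struct shrinker *s, int nr_to_scan, gfp_t mask)
{
	(void)s; (void)mask;
	return nr_to_scan ? 0 : 42;	/* 42 stands in for "objects remaining" */
}

/* New style: extensible through the control structure. */
static int shrink_new(struct shrinker *s, struct shrink_control *sc)
{
	return shrink_old(s, (int)sc->nr_to_scan, sc->gfp_mask);
}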
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a9a1c4512645..50a7d5fb3b73 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -178,6 +178,7 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
178 xfs_buf_flags_t); 178 xfs_buf_flags_t);
179 179
180extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); 180extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
181extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len);
181extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); 182extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
182extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); 183extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
183extern void xfs_buf_hold(xfs_buf_t *); 184extern void xfs_buf_hold(xfs_buf_t *);
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
index d61611c88012..244e797dae32 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -191,3 +191,32 @@ xfs_ioc_trim(
191 return -XFS_ERROR(EFAULT); 191 return -XFS_ERROR(EFAULT);
192 return 0; 192 return 0;
193} 193}
194
195int
196xfs_discard_extents(
197 struct xfs_mount *mp,
198 struct list_head *list)
199{
200 struct xfs_busy_extent *busyp;
201 int error = 0;
202
203 list_for_each_entry(busyp, list, list) {
204 trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
205 busyp->length);
206
207 error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
208 XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
209 XFS_FSB_TO_BB(mp, busyp->length),
210 GFP_NOFS, 0);
211 if (error && error != EOPNOTSUPP) {
212 xfs_info(mp,
213 "discard failed for extent [0x%llx,%u], error %d",
214 (unsigned long long)busyp->bno,
215 busyp->length,
216 error);
217 return error;
218 }
219 }
220
221 return 0;
222}
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
index e82b6dd3e127..344879aea646 100644
--- a/fs/xfs/linux-2.6/xfs_discard.h
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -2,7 +2,9 @@
2#define XFS_DISCARD_H 1 2#define XFS_DISCARD_H 1
3 3
4struct fstrim_range; 4struct fstrim_range;
5struct list_head;
5 6
6extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); 7extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
8extern int xfs_discard_extents(struct xfs_mount *, struct list_head *);
7 9
8#endif /* XFS_DISCARD_H */ 10#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index b3486dfa5520..54e623bfbb85 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -586,7 +586,8 @@ xfs_file_compat_ioctl(
586 case XFS_IOC_RESVSP_32: 586 case XFS_IOC_RESVSP_32:
587 case XFS_IOC_UNRESVSP_32: 587 case XFS_IOC_UNRESVSP_32:
588 case XFS_IOC_RESVSP64_32: 588 case XFS_IOC_RESVSP64_32:
589 case XFS_IOC_UNRESVSP64_32: { 589 case XFS_IOC_UNRESVSP64_32:
590 case XFS_IOC_ZERO_RANGE_32: {
590 struct xfs_flock64 bf; 591 struct xfs_flock64 bf;
591 592
592 if (xfs_compat_flock64_copyin(&bf, arg)) 593 if (xfs_compat_flock64_copyin(&bf, arg))
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 08b605792a99..80f4060e8970 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -184,6 +184,7 @@ typedef struct compat_xfs_flock64 {
184#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) 184#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64)
185#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) 185#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64)
186#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) 186#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64)
187#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64)
187 188
188typedef struct compat_xfs_fsop_geom_v1 { 189typedef struct compat_xfs_fsop_geom_v1 {
189 __u32 blocksize; /* filesystem (data) block size */ 190 __u32 blocksize; /* filesystem (data) block size */
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 244be9cbfe78..8633521b3b2e 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -70,6 +70,7 @@
70#include <linux/ctype.h> 70#include <linux/ctype.h>
71#include <linux/writeback.h> 71#include <linux/writeback.h>
72#include <linux/capability.h> 72#include <linux/capability.h>
73#include <linux/list_sort.h>
73 74
74#include <asm/page.h> 75#include <asm/page.h>
75#include <asm/div64.h> 76#include <asm/div64.h>
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
index 9f76cceb678d..bd672def95ac 100644
--- a/fs/xfs/linux-2.6/xfs_message.c
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -41,23 +41,6 @@ __xfs_printk(
41 printk("%sXFS: %pV\n", level, vaf); 41 printk("%sXFS: %pV\n", level, vaf);
42} 42}
43 43
44void xfs_printk(
45 const char *level,
46 const struct xfs_mount *mp,
47 const char *fmt, ...)
48{
49 struct va_format vaf;
50 va_list args;
51
52 va_start(args, fmt);
53
54 vaf.fmt = fmt;
55 vaf.va = &args;
56
57 __xfs_printk(level, mp, &vaf);
58 va_end(args);
59}
60
61#define define_xfs_printk_level(func, kern_level) \ 44#define define_xfs_printk_level(func, kern_level) \
62void func(const struct xfs_mount *mp, const char *fmt, ...) \ 45void func(const struct xfs_mount *mp, const char *fmt, ...) \
63{ \ 46{ \
@@ -95,8 +78,7 @@ xfs_alert_tag(
95 int do_panic = 0; 78 int do_panic = 0;
96 79
97 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { 80 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
98 xfs_printk(KERN_ALERT, mp, 81 xfs_alert(mp, "Transforming an alert into a BUG.");
99 "XFS: Transforming an alert into a BUG.");
100 do_panic = 1; 82 do_panic = 1;
101 } 83 }
102 84
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
index f1b3fc1b6c4e..7fb7ea007672 100644
--- a/fs/xfs/linux-2.6/xfs_message.h
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -3,9 +3,6 @@
3 3
4struct xfs_mount; 4struct xfs_mount;
5 5
6extern void xfs_printk(const char *level, const struct xfs_mount *mp,
7 const char *fmt, ...)
8 __attribute__ ((format (printf, 3, 4)));
9extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...) 6extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
10 __attribute__ ((format (printf, 2, 3))); 7 __attribute__ ((format (printf, 2, 3)));
11extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...) 8extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
@@ -28,7 +25,9 @@ extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
28extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) 25extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
29 __attribute__ ((format (printf, 2, 3))); 26 __attribute__ ((format (printf, 2, 3)));
30#else 27#else
31static inline void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) 28static inline void
29__attribute__ ((format (printf, 2, 3)))
30xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
32{ 31{
33} 32}
34#endif 33#endif
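The point of restructuring the !DEBUG stub is to keep printf-style format checking alive even when the body compiles to nothing: the attribute stays attached to the empty inline, so gcc -Wformat still verifies every xfs_debug() call site. A minimal reproduction of the pattern:

/* Empty body, but -Wformat still checks callers: xdbg(0, "%s", 42) warns. */
static inline void
__attribute__ ((format (printf, 2, 3)))
xdbg(const void *ctx, const char *fmt, ...)
{
	(void)ctx;
	(void)fmt;
}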
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index b38e58d02299..98b9c91fcdf1 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -110,8 +110,10 @@ mempool_t *xfs_ioend_pool;
110#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ 110#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
111#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ 111#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
112#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ 112#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
113#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ 113#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
114#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ 114#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
115#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
116#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
115 117
116/* 118/*
117 * Table driven mount option parser. 119 * Table driven mount option parser.
@@ -355,6 +357,10 @@ xfs_parseargs(
355 mp->m_flags |= XFS_MOUNT_DELAYLOG; 357 mp->m_flags |= XFS_MOUNT_DELAYLOG;
356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 358 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 359 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
360 } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
361 mp->m_flags |= XFS_MOUNT_DISCARD;
362 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
363 mp->m_flags &= ~XFS_MOUNT_DISCARD;
358 } else if (!strcmp(this_char, "ihashsize")) { 364 } else if (!strcmp(this_char, "ihashsize")) {
359 xfs_warn(mp, 365 xfs_warn(mp,
360 "ihashsize no longer used, option is deprecated."); 366 "ihashsize no longer used, option is deprecated.");
@@ -388,6 +394,13 @@ xfs_parseargs(
388 return EINVAL; 394 return EINVAL;
389 } 395 }
390 396
397 if ((mp->m_flags & XFS_MOUNT_DISCARD) &&
398 !(mp->m_flags & XFS_MOUNT_DELAYLOG)) {
399 xfs_warn(mp,
400 "the discard option is incompatible with the nodelaylog option");
401 return EINVAL;
402 }
403
391#ifndef CONFIG_XFS_QUOTA 404#ifndef CONFIG_XFS_QUOTA
392 if (XFS_IS_QUOTA_RUNNING(mp)) { 405 if (XFS_IS_QUOTA_RUNNING(mp)) {
393 xfs_warn(mp, "quota support not available in this kernel."); 406 xfs_warn(mp, "quota support not available in this kernel.");
@@ -488,6 +501,7 @@ xfs_showargs(
488 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 501 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
489 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 502 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
490 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, 503 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
504 { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
491 { 0, NULL } 505 { 0, NULL }
492 }; 506 };
493 static struct proc_xfs_info xfs_info_unset[] = { 507 static struct proc_xfs_info xfs_info_unset[] = {
@@ -1787,10 +1801,6 @@ init_xfs_fs(void)
1787 if (error) 1801 if (error)
1788 goto out_cleanup_procfs; 1802 goto out_cleanup_procfs;
1789 1803
1790 error = xfs_init_workqueues();
1791 if (error)
1792 goto out_sysctl_unregister;
1793
1794 vfs_initquota(); 1804 vfs_initquota();
1795 1805
1796 error = register_filesystem(&xfs_fs_type); 1806 error = register_filesystem(&xfs_fs_type);
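The new parse-time check earlier in this file rejects option combinations the implementation cannot honour (online discard depends on the delayed-logging infrastructure) instead of silently ignoring one of them. The shape of the check, reduced to a self-contained sketch with hypothetical flag values:

#include <stdio.h>

#define MOUNT_DELAYLOG	0x1
#define MOUNT_DISCARD	0x2

static int validate_opts(unsigned int flags)
{
	if ((flags & MOUNT_DISCARD) && !(flags & MOUNT_DELAYLOG)) {
		fprintf(stderr, "discard requires delaylog\n");
		return -1;	/* the kernel returns EINVAL here */
	}
	return 0;
}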
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 3e898a48122d..8ecad5ff9f9b 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -267,6 +267,16 @@ xfs_sync_inode_attr(
267 267
268 error = xfs_iflush(ip, flags); 268 error = xfs_iflush(ip, flags);
269 269
270 /*
271 * We don't want to try again on non-blocking flushes that can't run
272 * again immediately. If an inode really must be written, then that's
273 * what the SYNC_WAIT flag is for.
274 */
275 if (error == EAGAIN) {
276 ASSERT(!(flags & SYNC_WAIT));
277 error = 0;
278 }
279
270 out_unlock: 280 out_unlock:
271 xfs_iunlock(ip, XFS_ILOCK_SHARED); 281 xfs_iunlock(ip, XFS_ILOCK_SHARED);
272 return error; 282 return error;
@@ -1022,13 +1032,14 @@ xfs_reclaim_inodes(
1022static int 1032static int
1023xfs_reclaim_inode_shrink( 1033xfs_reclaim_inode_shrink(
1024 struct shrinker *shrink, 1034 struct shrinker *shrink,
1025 int nr_to_scan, 1035 struct shrink_control *sc)
1026 gfp_t gfp_mask)
1027{ 1036{
1028 struct xfs_mount *mp; 1037 struct xfs_mount *mp;
1029 struct xfs_perag *pag; 1038 struct xfs_perag *pag;
1030 xfs_agnumber_t ag; 1039 xfs_agnumber_t ag;
1031 int reclaimable; 1040 int reclaimable;
1041 int nr_to_scan = sc->nr_to_scan;
1042 gfp_t gfp_mask = sc->gfp_mask;
1032 1043
1033 mp = container_of(shrink, struct xfs_mount, m_inode_shrink); 1044 mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
1034 if (nr_to_scan) { 1045 if (nr_to_scan) {
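The EAGAIN handling added above encodes a simple policy: for non-blocking flushes, "busy, try again later" is not a failure, while blocking (SYNC_WAIT) flushes should never see it at all. The same policy as a standalone helper, with a hypothetical stand-in for the kernel flag:

#include <errno.h>

#define SYNC_WAIT 0x1

static int filter_flush_error(int error, int flags)
{
	if (error == EAGAIN && !(flags & SYNC_WAIT))
		return 0;	/* the next periodic pass will retry */
	return error;
}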
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 2d0bcb479075..d48b7a579ae1 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1151,44 +1151,7 @@ TRACE_EVENT(xfs_bunmap,
1151 1151
1152); 1152);
1153 1153
1154#define XFS_BUSY_SYNC \ 1154DECLARE_EVENT_CLASS(xfs_busy_class,
1155 { 0, "async" }, \
1156 { 1, "sync" }
1157
1158TRACE_EVENT(xfs_alloc_busy,
1159 TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
1160 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
1161 TP_ARGS(trans, agno, agbno, len, sync),
1162 TP_STRUCT__entry(
1163 __field(dev_t, dev)
1164 __field(struct xfs_trans *, tp)
1165 __field(int, tid)
1166 __field(xfs_agnumber_t, agno)
1167 __field(xfs_agblock_t, agbno)
1168 __field(xfs_extlen_t, len)
1169 __field(int, sync)
1170 ),
1171 TP_fast_assign(
1172 __entry->dev = trans->t_mountp->m_super->s_dev;
1173 __entry->tp = trans;
1174 __entry->tid = trans->t_ticket->t_tid;
1175 __entry->agno = agno;
1176 __entry->agbno = agbno;
1177 __entry->len = len;
1178 __entry->sync = sync;
1179 ),
1180 TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
1181 MAJOR(__entry->dev), MINOR(__entry->dev),
1182 __entry->tp,
1183 __entry->tid,
1184 __entry->agno,
1185 __entry->agbno,
1186 __entry->len,
1187 __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
1188
1189);
1190
1191TRACE_EVENT(xfs_alloc_unbusy,
1192 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1155 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1193 xfs_agblock_t agbno, xfs_extlen_t len), 1156 xfs_agblock_t agbno, xfs_extlen_t len),
1194 TP_ARGS(mp, agno, agbno, len), 1157 TP_ARGS(mp, agno, agbno, len),
@@ -1210,35 +1173,45 @@ TRACE_EVENT(xfs_alloc_unbusy,
1210 __entry->agbno, 1173 __entry->agbno,
1211 __entry->len) 1174 __entry->len)
1212); 1175);
1176#define DEFINE_BUSY_EVENT(name) \
1177DEFINE_EVENT(xfs_busy_class, name, \
1178 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
1179 xfs_agblock_t agbno, xfs_extlen_t len), \
1180 TP_ARGS(mp, agno, agbno, len))
1181DEFINE_BUSY_EVENT(xfs_alloc_busy);
1182DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem);
1183DEFINE_BUSY_EVENT(xfs_alloc_busy_force);
1184DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse);
1185DEFINE_BUSY_EVENT(xfs_alloc_busy_clear);
1213 1186
1214#define XFS_BUSY_STATES \ 1187TRACE_EVENT(xfs_alloc_busy_trim,
1215 { 0, "missing" }, \
1216 { 1, "found" }
1217
1218TRACE_EVENT(xfs_alloc_busysearch,
1219 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1188 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1220 xfs_agblock_t agbno, xfs_extlen_t len, int found), 1189 xfs_agblock_t agbno, xfs_extlen_t len,
1221 TP_ARGS(mp, agno, agbno, len, found), 1190 xfs_agblock_t tbno, xfs_extlen_t tlen),
1191 TP_ARGS(mp, agno, agbno, len, tbno, tlen),
1222 TP_STRUCT__entry( 1192 TP_STRUCT__entry(
1223 __field(dev_t, dev) 1193 __field(dev_t, dev)
1224 __field(xfs_agnumber_t, agno) 1194 __field(xfs_agnumber_t, agno)
1225 __field(xfs_agblock_t, agbno) 1195 __field(xfs_agblock_t, agbno)
1226 __field(xfs_extlen_t, len) 1196 __field(xfs_extlen_t, len)
1227 __field(int, found) 1197 __field(xfs_agblock_t, tbno)
1198 __field(xfs_extlen_t, tlen)
1228 ), 1199 ),
1229 TP_fast_assign( 1200 TP_fast_assign(
1230 __entry->dev = mp->m_super->s_dev; 1201 __entry->dev = mp->m_super->s_dev;
1231 __entry->agno = agno; 1202 __entry->agno = agno;
1232 __entry->agbno = agbno; 1203 __entry->agbno = agbno;
1233 __entry->len = len; 1204 __entry->len = len;
1234 __entry->found = found; 1205 __entry->tbno = tbno;
1206 __entry->tlen = tlen;
1235 ), 1207 ),
1236 TP_printk("dev %d:%d agno %u agbno %u len %u %s", 1208 TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u",
1237 MAJOR(__entry->dev), MINOR(__entry->dev), 1209 MAJOR(__entry->dev), MINOR(__entry->dev),
1238 __entry->agno, 1210 __entry->agno,
1239 __entry->agbno, 1211 __entry->agbno,
1240 __entry->len, 1212 __entry->len,
1241 __print_symbolic(__entry->found, XFS_BUSY_STATES)) 1213 __entry->tbno,
1214 __entry->tlen)
1242); 1215);
1243 1216
1244TRACE_EVENT(xfs_trans_commit_lsn, 1217TRACE_EVENT(xfs_trans_commit_lsn,
@@ -1418,7 +1391,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
1418 __entry->wasfromfl, 1391 __entry->wasfromfl,
1419 __entry->isfl, 1392 __entry->isfl,
1420 __entry->userdata, 1393 __entry->userdata,
1421 __entry->firstblock) 1394 (unsigned long long)__entry->firstblock)
1422) 1395)
1423 1396
1424#define DEFINE_ALLOC_EVENT(name) \ 1397#define DEFINE_ALLOC_EVENT(name) \
@@ -1433,11 +1406,14 @@ DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
1433DEFINE_ALLOC_EVENT(xfs_alloc_near_greater); 1406DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
1434DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser); 1407DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
1435DEFINE_ALLOC_EVENT(xfs_alloc_near_error); 1408DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
1409DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
1410DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
1436DEFINE_ALLOC_EVENT(xfs_alloc_size_neither); 1411DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
1437DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry); 1412DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
1438DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft); 1413DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
1439DEFINE_ALLOC_EVENT(xfs_alloc_size_done); 1414DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
1440DEFINE_ALLOC_EVENT(xfs_alloc_size_error); 1415DEFINE_ALLOC_EVENT(xfs_alloc_size_error);
1416DEFINE_ALLOC_EVENT(xfs_alloc_size_busy);
1441DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist); 1417DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
1442DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough); 1418DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
1443DEFINE_ALLOC_EVENT(xfs_alloc_small_done); 1419DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 69228aa8605a..b94dace4e785 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -60,7 +60,7 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
60 60
61STATIC int xfs_qm_init_quotainos(xfs_mount_t *); 61STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
62STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 62STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
63STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t); 63STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *);
64 64
65static struct shrinker xfs_qm_shaker = { 65static struct shrinker xfs_qm_shaker = {
66 .shrink = xfs_qm_shake, 66 .shrink = xfs_qm_shake,
@@ -2009,10 +2009,10 @@ xfs_qm_shake_freelist(
2009STATIC int 2009STATIC int
2010xfs_qm_shake( 2010xfs_qm_shake(
2011 struct shrinker *shrink, 2011 struct shrinker *shrink,
2012 int nr_to_scan, 2012 struct shrink_control *sc)
2013 gfp_t gfp_mask)
2014{ 2013{
2015 int ndqused, nfree, n; 2014 int ndqused, nfree, n;
2015 gfp_t gfp_mask = sc->gfp_mask;
2016 2016
2017 if (!kmem_shake_allow(gfp_mask)) 2017 if (!kmem_shake_allow(gfp_mask))
2018 return 0; 2018 return 0;
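For context, the hunk above tracks the kernel-wide shrinker API change: the scan count and gfp flags now arrive packed in a struct shrink_control instead of as separate arguments. A minimal userspace sketch of the before/after callback shape — all names here are illustrative stand-ins, not the kernel's own types:

#include <stdio.h>

/* Hypothetical stand-in for the kernel's struct shrink_control. */
struct shrink_control {
	unsigned long nr_to_scan;
	unsigned int gfp_mask;
};

/* Old style: scan count and gfp flags passed as separate arguments. */
static int shake_old(void *shrinker, int nr_to_scan, unsigned int gfp_mask)
{
	(void)shrinker; (void)gfp_mask;
	return nr_to_scan;
}

/* New style: the same values are unpacked from the control structure,
 * as the converted xfs_qm_shake() does with sc->gfp_mask. */
static int shake_new(void *shrinker, struct shrink_control *sc)
{
	unsigned int gfp_mask = sc->gfp_mask;

	(void)shrinker; (void)gfp_mask;
	return (int)sc->nr_to_scan;
}

int main(void)
{
	struct shrink_control sc = { .nr_to_scan = 128, .gfp_mask = 0 };

	printf("old=%d new=%d\n", shake_old(0, 128, 0), shake_new(0, &sc));
	return 0;
}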
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 58632cc17f2d..6530769a999b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -187,7 +187,9 @@ struct xfs_busy_extent {
187 xfs_agnumber_t agno; 187 xfs_agnumber_t agno;
188 xfs_agblock_t bno; 188 xfs_agblock_t bno;
189 xfs_extlen_t length; 189 xfs_extlen_t length;
190 xlog_tid_t tid; /* transaction that created this */ 190 unsigned int flags;
191#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */
192#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */
191}; 193};
192 194
193/* 195/*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 27d64d752eab..95862bbff56b 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,19 +41,13 @@
41#define XFSA_FIXUP_BNO_OK 1 41#define XFSA_FIXUP_BNO_OK 1
42#define XFSA_FIXUP_CNT_OK 2 42#define XFSA_FIXUP_CNT_OK 2
43 43
44/*
45 * Prototypes for per-ag allocation routines
46 */
47
48STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); 44STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
49STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); 45STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
50STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); 46STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
51STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, 47STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
52 xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); 48 xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
53 49STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *,
54/* 50 xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *);
55 * Internal functions.
56 */
57 51
58/* 52/*
59 * Lookup the record equal to [bno, len] in the btree given by cur. 53 * Lookup the record equal to [bno, len] in the btree given by cur.
@@ -154,19 +148,21 @@ xfs_alloc_compute_aligned(
154 xfs_extlen_t *reslen) /* result length */ 148 xfs_extlen_t *reslen) /* result length */
155{ 149{
156 xfs_agblock_t bno; 150 xfs_agblock_t bno;
157 xfs_extlen_t diff;
158 xfs_extlen_t len; 151 xfs_extlen_t len;
159 152
160 if (args->alignment > 1 && foundlen >= args->minlen) { 153 /* Trim busy sections out of found extent */
161 bno = roundup(foundbno, args->alignment); 154 xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len);
162 diff = bno - foundbno; 155
163 len = diff >= foundlen ? 0 : foundlen - diff; 156 if (args->alignment > 1 && len >= args->minlen) {
157 xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
158 xfs_extlen_t diff = aligned_bno - bno;
159
160 *resbno = aligned_bno;
161 *reslen = diff >= len ? 0 : len - diff;
164 } else { 162 } else {
165 bno = foundbno; 163 *resbno = bno;
166 len = foundlen; 164 *reslen = len;
167 } 165 }
168 *resbno = bno;
169 *reslen = len;
170} 166}
171 167
172/* 168/*
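The reworked xfs_alloc_compute_aligned() above first trims busy blocks out of the found extent and only then applies the alignment round-up. A standalone sketch of just the alignment step, with the busy trim stubbed out and roundup() reimplemented locally:

#include <stdio.h>

/* Round x up to the next multiple of align (align >= 1). */
static unsigned int roundup_u(unsigned int x, unsigned int align)
{
	return ((x + align - 1) / align) * align;
}

/*
 * Mirror of the aligned-extent computation: round the start block up
 * to the alignment and shorten the usable length by the blocks that
 * were skipped. A too-short extent degenerates to length zero.
 */
static void compute_aligned(unsigned int bno, unsigned int len,
			    unsigned int alignment, unsigned int minlen,
			    unsigned int *resbno, unsigned int *reslen)
{
	if (alignment > 1 && len >= minlen) {
		unsigned int aligned_bno = roundup_u(bno, alignment);
		unsigned int diff = aligned_bno - bno;

		*resbno = aligned_bno;
		*reslen = diff >= len ? 0 : len - diff;
	} else {
		*resbno = bno;
		*reslen = len;
	}
}

int main(void)
{
	unsigned int rbno, rlen;

	compute_aligned(1001, 50, 8, 4, &rbno, &rlen);
	printf("aligned extent: bno=%u len=%u\n", rbno, rlen); /* 1008, 43 */
	return 0;
}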
@@ -280,7 +276,6 @@ xfs_alloc_fix_minleft(
280 return 1; 276 return 1;
281 agf = XFS_BUF_TO_AGF(args->agbp); 277 agf = XFS_BUF_TO_AGF(args->agbp);
282 diff = be32_to_cpu(agf->agf_freeblks) 278 diff = be32_to_cpu(agf->agf_freeblks)
283 + be32_to_cpu(agf->agf_flcount)
284 - args->len - args->minleft; 279 - args->len - args->minleft;
285 if (diff >= 0) 280 if (diff >= 0)
286 return 1; 281 return 1;
@@ -541,16 +536,8 @@ xfs_alloc_ag_vextent(
541 if (error) 536 if (error)
542 return error; 537 return error;
543 538
544 /* 539 ASSERT(!xfs_alloc_busy_search(args->mp, args->agno,
545 * Search the busylist for these blocks and mark the 540 args->agbno, args->len));
546 * transaction as synchronous if blocks are found. This
547 * avoids the need to block due to a synchronous log
548 * force to ensure correct ordering as the synchronous
549 * transaction will guarantee that for us.
550 */
551 if (xfs_alloc_busy_search(args->mp, args->agno,
552 args->agbno, args->len))
553 xfs_trans_set_sync(args->tp);
554 } 541 }
555 542
556 if (!args->isfl) { 543 if (!args->isfl) {
@@ -577,14 +564,14 @@ xfs_alloc_ag_vextent_exact(
577{ 564{
578 xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ 565 xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
579 xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ 566 xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
580 xfs_agblock_t end; /* end of allocated extent */
581 int error; 567 int error;
582 xfs_agblock_t fbno; /* start block of found extent */ 568 xfs_agblock_t fbno; /* start block of found extent */
583 xfs_agblock_t fend; /* end block of found extent */
584 xfs_extlen_t flen; /* length of found extent */ 569 xfs_extlen_t flen; /* length of found extent */
570 xfs_agblock_t tbno; /* start block of trimmed extent */
571 xfs_extlen_t tlen; /* length of trimmed extent */
572 xfs_agblock_t tend; /* end block of trimmed extent */
573 xfs_agblock_t end; /* end of allocated extent */
585 int i; /* success/failure of operation */ 574 int i; /* success/failure of operation */
586 xfs_agblock_t maxend; /* end of maximal extent */
587 xfs_agblock_t minend; /* end of minimal extent */
588 xfs_extlen_t rlen; /* length of returned extent */ 575 xfs_extlen_t rlen; /* length of returned extent */
589 576
590 ASSERT(args->alignment == 1); 577 ASSERT(args->alignment == 1);
@@ -614,14 +601,22 @@ xfs_alloc_ag_vextent_exact(
614 goto error0; 601 goto error0;
615 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 602 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
616 ASSERT(fbno <= args->agbno); 603 ASSERT(fbno <= args->agbno);
617 minend = args->agbno + args->minlen;
618 maxend = args->agbno + args->maxlen;
619 fend = fbno + flen;
620 604
621 /* 605 /*
622 * Give up if the freespace isn't long enough for the minimum request. 606 * Check for overlapping busy extents.
623 */ 607 */
624 if (fend < minend) 608 xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen);
609
610 /*
611 * Give up if the start of the extent is busy, or the freespace isn't
612 * long enough for the minimum request.
613 */
614 if (tbno > args->agbno)
615 goto not_found;
616 if (tlen < args->minlen)
617 goto not_found;
618 tend = tbno + tlen;
619 if (tend < args->agbno + args->minlen)
625 goto not_found; 620 goto not_found;
626 621
627 /* 622 /*
@@ -630,14 +625,14 @@ xfs_alloc_ag_vextent_exact(
630 * 625 *
631 * Fix the length according to mod and prod if given. 626 * Fix the length according to mod and prod if given.
632 */ 627 */
633 end = XFS_AGBLOCK_MIN(fend, maxend); 628 end = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen);
634 args->len = end - args->agbno; 629 args->len = end - args->agbno;
635 xfs_alloc_fix_len(args); 630 xfs_alloc_fix_len(args);
636 if (!xfs_alloc_fix_minleft(args)) 631 if (!xfs_alloc_fix_minleft(args))
637 goto not_found; 632 goto not_found;
638 633
639 rlen = args->len; 634 rlen = args->len;
640 ASSERT(args->agbno + rlen <= fend); 635 ASSERT(args->agbno + rlen <= tend);
641 end = args->agbno + rlen; 636 end = args->agbno + rlen;
642 637
643 /* 638 /*
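The rewritten exact allocator above trims the found extent against busy blocks and then rejects it when the trimmed range no longer covers the requested start or minimum length. A sketch of those feasibility checks in isolation — all names are local to the example:

#include <stdbool.h>
#include <stdio.h>

/*
 * An exact allocation at [agbno, agbno + minlen) is only possible if the
 * trimmed free extent [tbno, tbno + tlen) still starts at or before agbno
 * and reaches at least agbno + minlen, mirroring the checks added above.
 */
static bool exact_alloc_possible(unsigned int agbno, unsigned int minlen,
				 unsigned int tbno, unsigned int tlen)
{
	if (tbno > agbno)			/* start of the extent is busy */
		return false;
	if (tlen < minlen)			/* trimmed extent too short */
		return false;
	if (tbno + tlen < agbno + minlen)	/* doesn't reach far enough */
		return false;
	return true;
}

int main(void)
{
	/* Busy trim moved the start past the requested block: must fail. */
	printf("%d\n", exact_alloc_possible(100, 10, 104, 50)); /* 0 */
	/* Trimmed extent still covers [100, 110): succeeds. */
	printf("%d\n", exact_alloc_possible(100, 10, 96, 20));  /* 1 */
	return 0;
}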
@@ -686,11 +681,11 @@ xfs_alloc_find_best_extent(
686 struct xfs_btree_cur **scur, /* searching cursor */ 681 struct xfs_btree_cur **scur, /* searching cursor */
687 xfs_agblock_t gdiff, /* difference for search comparison */ 682 xfs_agblock_t gdiff, /* difference for search comparison */
688 xfs_agblock_t *sbno, /* extent found by search */ 683 xfs_agblock_t *sbno, /* extent found by search */
689 xfs_extlen_t *slen, 684 xfs_extlen_t *slen, /* extent length */
690 xfs_extlen_t *slena, /* aligned length */ 685 xfs_agblock_t *sbnoa, /* aligned extent found by search */
686 xfs_extlen_t *slena, /* aligned extent length */
691 int dir) /* 0 = search right, 1 = search left */ 687 int dir) /* 0 = search right, 1 = search left */
692{ 688{
693 xfs_agblock_t bno;
694 xfs_agblock_t new; 689 xfs_agblock_t new;
695 xfs_agblock_t sdiff; 690 xfs_agblock_t sdiff;
696 int error; 691 int error;
@@ -708,16 +703,16 @@ xfs_alloc_find_best_extent(
708 if (error) 703 if (error)
709 goto error0; 704 goto error0;
710 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 705 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
711 xfs_alloc_compute_aligned(args, *sbno, *slen, &bno, slena); 706 xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
712 707
713 /* 708 /*
714 * The good extent is closer than this one. 709 * The good extent is closer than this one.
715 */ 710 */
716 if (!dir) { 711 if (!dir) {
717 if (bno >= args->agbno + gdiff) 712 if (*sbnoa >= args->agbno + gdiff)
718 goto out_use_good; 713 goto out_use_good;
719 } else { 714 } else {
720 if (bno <= args->agbno - gdiff) 715 if (*sbnoa <= args->agbno - gdiff)
721 goto out_use_good; 716 goto out_use_good;
722 } 717 }
723 718
@@ -729,8 +724,8 @@ xfs_alloc_find_best_extent(
729 xfs_alloc_fix_len(args); 724 xfs_alloc_fix_len(args);
730 725
731 sdiff = xfs_alloc_compute_diff(args->agbno, args->len, 726 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
732 args->alignment, *sbno, 727 args->alignment, *sbnoa,
733 *slen, &new); 728 *slena, &new);
734 729
735 /* 730 /*
736 * Choose closer size and invalidate other cursor. 731 * Choose closer size and invalidate other cursor.
@@ -780,7 +775,7 @@ xfs_alloc_ag_vextent_near(
780 xfs_agblock_t gtbnoa; /* aligned ... */ 775 xfs_agblock_t gtbnoa; /* aligned ... */
781 xfs_extlen_t gtdiff; /* difference to right side entry */ 776 xfs_extlen_t gtdiff; /* difference to right side entry */
782 xfs_extlen_t gtlen; /* length of right side entry */ 777 xfs_extlen_t gtlen; /* length of right side entry */
783 xfs_extlen_t gtlena = 0; /* aligned ... */ 778 xfs_extlen_t gtlena; /* aligned ... */
784 xfs_agblock_t gtnew; /* useful start bno of right side */ 779 xfs_agblock_t gtnew; /* useful start bno of right side */
785 int error; /* error code */ 780 int error; /* error code */
786 int i; /* result code, temporary */ 781 int i; /* result code, temporary */
@@ -789,9 +784,10 @@ xfs_alloc_ag_vextent_near(
789 xfs_agblock_t ltbnoa; /* aligned ... */ 784 xfs_agblock_t ltbnoa; /* aligned ... */
790 xfs_extlen_t ltdiff; /* difference to left side entry */ 785 xfs_extlen_t ltdiff; /* difference to left side entry */
791 xfs_extlen_t ltlen; /* length of left side entry */ 786 xfs_extlen_t ltlen; /* length of left side entry */
792 xfs_extlen_t ltlena = 0; /* aligned ... */ 787 xfs_extlen_t ltlena; /* aligned ... */
793 xfs_agblock_t ltnew; /* useful start bno of left side */ 788 xfs_agblock_t ltnew; /* useful start bno of left side */
794 xfs_extlen_t rlen; /* length of returned extent */ 789 xfs_extlen_t rlen; /* length of returned extent */
790 int forced = 0;
795#if defined(DEBUG) && defined(__KERNEL__) 791#if defined(DEBUG) && defined(__KERNEL__)
796 /* 792 /*
797 * Randomly don't execute the first algorithm. 793 * Randomly don't execute the first algorithm.
@@ -800,13 +796,20 @@ xfs_alloc_ag_vextent_near(
800 796
801 dofirst = random32() & 1; 797 dofirst = random32() & 1;
802#endif 798#endif
799
800restart:
801 bno_cur_lt = NULL;
802 bno_cur_gt = NULL;
803 ltlen = 0;
804 gtlena = 0;
805 ltlena = 0;
806
803 /* 807 /*
804 * Get a cursor for the by-size btree. 808 * Get a cursor for the by-size btree.
805 */ 809 */
806 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 810 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
807 args->agno, XFS_BTNUM_CNT); 811 args->agno, XFS_BTNUM_CNT);
808 ltlen = 0; 812
809 bno_cur_lt = bno_cur_gt = NULL;
810 /* 813 /*
811 * See if there are any free extents as big as maxlen. 814 * See if there are any free extents as big as maxlen.
812 */ 815 */
@@ -822,11 +825,13 @@ xfs_alloc_ag_vextent_near(
822 goto error0; 825 goto error0;
823 if (i == 0 || ltlen == 0) { 826 if (i == 0 || ltlen == 0) {
824 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 827 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
828 trace_xfs_alloc_near_noentry(args);
825 return 0; 829 return 0;
826 } 830 }
827 ASSERT(i == 1); 831 ASSERT(i == 1);
828 } 832 }
829 args->wasfromfl = 0; 833 args->wasfromfl = 0;
834
830 /* 835 /*
831 * First algorithm. 836 * First algorithm.
832 * If the requested extent is large wrt the freespaces available 837 * If the requested extent is large wrt the freespaces available
@@ -890,7 +895,7 @@ xfs_alloc_ag_vextent_near(
890 if (args->len < blen) 895 if (args->len < blen)
891 continue; 896 continue;
892 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 897 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
893 args->alignment, ltbno, ltlen, &ltnew); 898 args->alignment, ltbnoa, ltlena, &ltnew);
894 if (ltnew != NULLAGBLOCK && 899 if (ltnew != NULLAGBLOCK &&
895 (args->len > blen || ltdiff < bdiff)) { 900 (args->len > blen || ltdiff < bdiff)) {
896 bdiff = ltdiff; 901 bdiff = ltdiff;
@@ -1042,11 +1047,12 @@ xfs_alloc_ag_vextent_near(
1042 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1047 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
1043 xfs_alloc_fix_len(args); 1048 xfs_alloc_fix_len(args);
1044 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1049 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1045 args->alignment, ltbno, ltlen, &ltnew); 1050 args->alignment, ltbnoa, ltlena, &ltnew);
1046 1051
1047 error = xfs_alloc_find_best_extent(args, 1052 error = xfs_alloc_find_best_extent(args,
1048 &bno_cur_lt, &bno_cur_gt, 1053 &bno_cur_lt, &bno_cur_gt,
1049 ltdiff, &gtbno, &gtlen, &gtlena, 1054 ltdiff, &gtbno, &gtlen,
1055 &gtbnoa, &gtlena,
1050 0 /* search right */); 1056 0 /* search right */);
1051 } else { 1057 } else {
1052 ASSERT(gtlena >= args->minlen); 1058 ASSERT(gtlena >= args->minlen);
@@ -1057,11 +1063,12 @@ xfs_alloc_ag_vextent_near(
1057 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1063 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1058 xfs_alloc_fix_len(args); 1064 xfs_alloc_fix_len(args);
1059 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1065 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1060 args->alignment, gtbno, gtlen, &gtnew); 1066 args->alignment, gtbnoa, gtlena, &gtnew);
1061 1067
1062 error = xfs_alloc_find_best_extent(args, 1068 error = xfs_alloc_find_best_extent(args,
1063 &bno_cur_gt, &bno_cur_lt, 1069 &bno_cur_gt, &bno_cur_lt,
1064 gtdiff, &ltbno, &ltlen, &ltlena, 1070 gtdiff, &ltbno, &ltlen,
1071 &ltbnoa, &ltlena,
1065 1 /* search left */); 1072 1 /* search left */);
1066 } 1073 }
1067 1074
@@ -1073,6 +1080,12 @@ xfs_alloc_ag_vextent_near(
1073 * If we couldn't get anything, give up. 1080 * If we couldn't get anything, give up.
1074 */ 1081 */
1075 if (bno_cur_lt == NULL && bno_cur_gt == NULL) { 1082 if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
1083 if (!forced++) {
1084 trace_xfs_alloc_near_busy(args);
1085 xfs_log_force(args->mp, XFS_LOG_SYNC);
1086 goto restart;
1087 }
1088
1076 trace_xfs_alloc_size_neither(args); 1089 trace_xfs_alloc_size_neither(args);
1077 args->agbno = NULLAGBLOCK; 1090 args->agbno = NULLAGBLOCK;
1078 return 0; 1091 return 0;
@@ -1107,12 +1120,13 @@ xfs_alloc_ag_vextent_near(
1107 return 0; 1120 return 0;
1108 } 1121 }
1109 rlen = args->len; 1122 rlen = args->len;
1110 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno, 1123 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
1111 ltlen, &ltnew); 1124 ltbnoa, ltlena, &ltnew);
1112 ASSERT(ltnew >= ltbno); 1125 ASSERT(ltnew >= ltbno);
1113 ASSERT(ltnew + rlen <= ltbno + ltlen); 1126 ASSERT(ltnew + rlen <= ltbnoa + ltlena);
1114 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1127 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
1115 args->agbno = ltnew; 1128 args->agbno = ltnew;
1129
1116 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, 1130 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
1117 ltnew, rlen, XFSA_FIXUP_BNO_OK))) 1131 ltnew, rlen, XFSA_FIXUP_BNO_OK)))
1118 goto error0; 1132 goto error0;
@@ -1155,26 +1169,35 @@ xfs_alloc_ag_vextent_size(
1155 int i; /* temp status variable */ 1169 int i; /* temp status variable */
1156 xfs_agblock_t rbno; /* returned block number */ 1170 xfs_agblock_t rbno; /* returned block number */
1157 xfs_extlen_t rlen; /* length of returned extent */ 1171 xfs_extlen_t rlen; /* length of returned extent */
1172 int forced = 0;
1158 1173
1174restart:
1159 /* 1175 /*
1160 * Allocate and initialize a cursor for the by-size btree. 1176 * Allocate and initialize a cursor for the by-size btree.
1161 */ 1177 */
1162 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 1178 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1163 args->agno, XFS_BTNUM_CNT); 1179 args->agno, XFS_BTNUM_CNT);
1164 bno_cur = NULL; 1180 bno_cur = NULL;
1181
1165 /* 1182 /*
1166 * Look for an entry >= maxlen+alignment-1 blocks. 1183 * Look for an entry >= maxlen+alignment-1 blocks.
1167 */ 1184 */
1168 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, 1185 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1169 args->maxlen + args->alignment - 1, &i))) 1186 args->maxlen + args->alignment - 1, &i)))
1170 goto error0; 1187 goto error0;
1188
1171 /* 1189 /*
1172 * If none, then pick up the last entry in the tree unless the 1190 * If none or we have busy extents that we cannot allocate from, then
1173 * tree is empty. 1191 * we have to settle for a smaller extent. In the case that there are
1192 * no large extents, this will return the last entry in the tree unless
1193 * the tree is empty. In the case that there are only busy large
1194 * extents, this will return the largest small extent unless there
1195 * are no smaller extents available.
1174 */ 1196 */
1175 if (!i) { 1197 if (!i || forced > 1) {
1176 if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno, 1198 error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1177 &flen, &i))) 1199 &fbno, &flen, &i);
1200 if (error)
1178 goto error0; 1201 goto error0;
1179 if (i == 0 || flen == 0) { 1202 if (i == 0 || flen == 0) {
1180 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1203 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -1182,22 +1205,56 @@ xfs_alloc_ag_vextent_size(
1182 return 0; 1205 return 0;
1183 } 1206 }
1184 ASSERT(i == 1); 1207 ASSERT(i == 1);
1208 xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
1209 } else {
1210 /*
1211 * Search for a non-busy extent that is large enough.
1212 * If we are at low space, don't check, or if we fall off
1213 * the end of the btree, turn off the busy check and
1214 * restart.
1215 */
1216 for (;;) {
1217 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1218 if (error)
1219 goto error0;
1220 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1221
1222 xfs_alloc_compute_aligned(args, fbno, flen,
1223 &rbno, &rlen);
1224
1225 if (rlen >= args->maxlen)
1226 break;
1227
1228 error = xfs_btree_increment(cnt_cur, 0, &i);
1229 if (error)
1230 goto error0;
1231 if (i == 0) {
1232 /*
1233 * Our only valid extents must have been busy.
1234 * Make it unbusy by forcing the log out and
1235 * retrying. If we've been here before, forcing
1236 * the log isn't making the extents available,
1237 * which means they have probably been freed in
1238 * this transaction. In that case, we have to
1239 * give up on them and we'll attempt a minlen
1240 * allocation the next time around.
1241 */
1242 xfs_btree_del_cursor(cnt_cur,
1243 XFS_BTREE_NOERROR);
1244 trace_xfs_alloc_size_busy(args);
1245 if (!forced++)
1246 xfs_log_force(args->mp, XFS_LOG_SYNC);
1247 goto restart;
1248 }
1249 }
1185 } 1250 }
1186 /* 1251
1187 * There's a freespace as big as maxlen+alignment-1, get it.
1188 */
1189 else {
1190 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i)))
1191 goto error0;
1192 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1193 }
1194 /* 1252 /*
1195 * In the first case above, we got the last entry in the 1253 * In the first case above, we got the last entry in the
1196 * by-size btree. Now we check to see if the space hits maxlen 1254 * by-size btree. Now we check to see if the space hits maxlen
1197 * once aligned; if not, we search left for something better. 1255 * once aligned; if not, we search left for something better.
1198 * This can't happen in the second case above. 1256 * This can't happen in the second case above.
1199 */ 1257 */
1200 xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
1201 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1258 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1202 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1259 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1203 (rlen <= flen && rbno + rlen <= fbno + flen), error0); 1260 (rlen <= flen && rbno + rlen <= fbno + flen), error0);
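Both allocators now share the retry discipline described in the comment above: on the first failure caused by busy extents they force the log and restart, and only on a repeat failure do they fall back (to a minlen allocation, or give up entirely). A schematic of that bounded-retry pattern, with the allocation attempt and log force reduced to stubs:

#include <stdbool.h>
#include <stdio.h>

static int attempts;

/* Stand-in for an allocation try that only succeeds once the log
 * force has cleared the busy extents (here: on the second attempt). */
static bool try_alloc(void)
{
	return ++attempts > 1;
}

static void force_log(void)
{
	printf("forcing log to clear busy extents\n");
}

/*
 * Mirror of the "if (!forced++) { force log; goto restart; }" pattern:
 * at most one log force, then either success or a hard fallback.
 */
static bool alloc_with_retry(void)
{
	int forced = 0;

restart:
	if (try_alloc())
		return true;
	if (!forced++) {
		force_log();
		goto restart;
	}
	return false;	/* fall back to a minlen allocation / give up */
}

int main(void)
{
	printf("allocated: %d\n", alloc_with_retry());
	return 0;
}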
@@ -1251,13 +1308,19 @@ xfs_alloc_ag_vextent_size(
1251 * Fix up the length. 1308 * Fix up the length.
1252 */ 1309 */
1253 args->len = rlen; 1310 args->len = rlen;
1254 xfs_alloc_fix_len(args); 1311 if (rlen < args->minlen) {
1255 if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) { 1312 if (!forced++) {
1256 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1313 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1257 trace_xfs_alloc_size_nominleft(args); 1314 trace_xfs_alloc_size_busy(args);
1258 args->agbno = NULLAGBLOCK; 1315 xfs_log_force(args->mp, XFS_LOG_SYNC);
1259 return 0; 1316 goto restart;
1317 }
1318 goto out_nominleft;
1260 } 1319 }
1320 xfs_alloc_fix_len(args);
1321
1322 if (!xfs_alloc_fix_minleft(args))
1323 goto out_nominleft;
1261 rlen = args->len; 1324 rlen = args->len;
1262 XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); 1325 XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
1263 /* 1326 /*
@@ -1287,6 +1350,12 @@ error0:
1287 if (bno_cur) 1350 if (bno_cur)
1288 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 1351 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1289 return error; 1352 return error;
1353
1354out_nominleft:
1355 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1356 trace_xfs_alloc_size_nominleft(args);
1357 args->agbno = NULLAGBLOCK;
1358 return 0;
1290} 1359}
1291 1360
1292/* 1361/*
@@ -1326,6 +1395,9 @@ xfs_alloc_ag_vextent_small(
1326 if (error) 1395 if (error)
1327 goto error0; 1396 goto error0;
1328 if (fbno != NULLAGBLOCK) { 1397 if (fbno != NULLAGBLOCK) {
1398 xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1,
1399 args->userdata);
1400
1329 if (args->userdata) { 1401 if (args->userdata) {
1330 xfs_buf_t *bp; 1402 xfs_buf_t *bp;
1331 1403
@@ -1617,18 +1689,6 @@ xfs_free_ag_extent(
1617 1689
1618 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); 1690 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
1619 1691
1620 /*
1621 * Since blocks move to the free list without the coordination
1622 * used in xfs_bmap_finish, we can't allow block to be available
1623 * for reallocation and non-transaction writing (user data)
1624 * until we know that the transaction that moved it to the free
1625 * list is permanently on disk. We track the blocks by declaring
1626 * these blocks as "busy"; the busy list is maintained on a per-ag
1627 * basis and each transaction records which entries should be removed
1628 * when the iclog commits to disk. If a busy block is allocated,
1629 * the iclog is pushed up to the LSN that freed the block.
1630 */
1631 xfs_alloc_busy_insert(tp, agno, bno, len);
1632 return 0; 1692 return 0;
1633 1693
1634 error0: 1694 error0:
@@ -1923,21 +1983,6 @@ xfs_alloc_get_freelist(
1923 xfs_alloc_log_agf(tp, agbp, logflags); 1983 xfs_alloc_log_agf(tp, agbp, logflags);
1924 *bnop = bno; 1984 *bnop = bno;
1925 1985
1926 /*
1927 * As blocks are freed, they are added to the per-ag busy list and
1928 * remain there until the freeing transaction is committed to disk.
1929 * Now that we have allocated blocks, this list must be searched to see
1930 * if a block is being reused. If one is, then the freeing transaction
1931 * must be pushed to disk before this transaction.
1932 *
1933 * We do this by setting the current transaction to a sync transaction
1934 * which guarantees that the freeing transaction is on disk before this
1935 * transaction. This is done instead of a synchronous log force here so
1936 * that we don't sit and wait with the AGF locked in the transaction
1937 * during the log force.
1938 */
1939 if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
1940 xfs_trans_set_sync(tp);
1941 return 0; 1986 return 0;
1942} 1987}
1943 1988
@@ -2423,119 +2468,26 @@ xfs_free_extent(
2423 } 2468 }
2424 2469
2425 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); 2470 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
2471 if (!error)
2472 xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0);
2426error0: 2473error0:
2427 xfs_perag_put(args.pag); 2474 xfs_perag_put(args.pag);
2428 return error; 2475 return error;
2429} 2476}
2430 2477
2431
2432/*
2433 * AG Busy list management
2434 * The busy list contains block ranges that have been freed but whose
2435 * transactions have not yet hit disk. If any block listed in a busy
2436 * list is reused, the transaction that freed it must be forced to disk
2437 * before continuing to use the block.
2438 *
2439 * xfs_alloc_busy_insert - add to the per-ag busy list
2440 * xfs_alloc_busy_clear - remove an item from the per-ag busy list
2441 * xfs_alloc_busy_search - search for a busy extent
2442 */
2443
2444/*
2445 * Insert a new extent into the busy tree.
2446 *
2447 * The busy extent tree is indexed by the start block of the busy extent.
2448 * there can be multiple overlapping ranges in the busy extent tree but only
2449 * ever one entry at a given start block. The reason for this is that
2450 * multi-block extents can be freed, then smaller chunks of that extent
2451 * allocated and freed again before the first transaction commit is on disk.
2452 * If the exact same start block is freed a second time, we have to wait for
2453 * that busy extent to pass out of the tree before the new extent is inserted.
2454 * There are two main cases we have to handle here.
2455 *
2456 * The first case is a transaction that triggers a "free - allocate - free"
2457 * cycle. This can occur during btree manipulations as a btree block is freed
2458 * to the freelist, then allocated from the free list, then freed again. In
 2459 * this case, the second extent free is what triggers the duplicate and as
2460 * such the transaction IDs should match. Because the extent was allocated in
2461 * this transaction, the transaction must be marked as synchronous. This is
2462 * true for all cases where the free/alloc/free occurs in the one transaction,
2463 * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
2464 * This serves to catch violations of the second case quite effectively.
2465 *
2466 * The second case is where the free/alloc/free occur in different
2467 * transactions. In this case, the thread freeing the extent the second time
2468 * can't mark the extent busy immediately because it is already tracked in a
2469 * transaction that may be committing. When the log commit for the existing
2470 * busy extent completes, the busy extent will be removed from the tree. If we
2471 * allow the second busy insert to continue using that busy extent structure,
2472 * it can be freed before this transaction is safely in the log. Hence our
2473 * only option in this case is to force the log to remove the existing busy
2474 * extent from the list before we insert the new one with the current
2475 * transaction ID.
2476 *
2477 * The problem we are trying to avoid in the free-alloc-free in separate
2478 * transactions is most easily described with a timeline:
2479 *
2480 * Thread 1 Thread 2 Thread 3 xfslogd
2481 * xact alloc
2482 * free X
2483 * mark busy
2484 * commit xact
2485 * free xact
2486 * xact alloc
2487 * alloc X
2488 * busy search
2489 * mark xact sync
2490 * commit xact
2491 * free xact
2492 * force log
2493 * checkpoint starts
2494 * ....
2495 * xact alloc
2496 * free X
2497 * mark busy
2498 * finds match
2499 * *** KABOOM! ***
2500 * ....
2501 * log IO completes
2502 * unbusy X
2503 * checkpoint completes
2504 *
2505 * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
2506 * the checkpoint completes, and the busy extent it matched will have been
2507 * removed from the tree when it is woken. Hence it can then continue safely.
2508 *
2509 * However, to ensure this matching process is robust, we need to use the
 2510 * transaction ID for identifying the transaction, as delayed logging results in
2511 * the busy extent and transaction lifecycles being different. i.e. the busy
2512 * extent is active for a lot longer than the transaction. Hence the
 2513 * transaction structure can be freed and reallocated, then used to mark the same
2514 * extent busy again in the new transaction. In this case the new transaction
2515 * will have a different tid but can have the same address, and hence we need
2516 * to check against the tid.
2517 *
2518 * Future: for delayed logging, we could avoid the log force if the extent was
2519 * first freed in the current checkpoint sequence. This, however, requires the
2520 * ability to pin the current checkpoint in memory until this transaction
2521 * commits to ensure that both the original free and the current one combine
2522 * logically into the one checkpoint. If the checkpoint sequences are
2523 * different, however, we still need to wait on a log force.
2524 */
2525void 2478void
2526xfs_alloc_busy_insert( 2479xfs_alloc_busy_insert(
2527 struct xfs_trans *tp, 2480 struct xfs_trans *tp,
2528 xfs_agnumber_t agno, 2481 xfs_agnumber_t agno,
2529 xfs_agblock_t bno, 2482 xfs_agblock_t bno,
2530 xfs_extlen_t len) 2483 xfs_extlen_t len,
2484 unsigned int flags)
2531{ 2485{
2532 struct xfs_busy_extent *new; 2486 struct xfs_busy_extent *new;
2533 struct xfs_busy_extent *busyp; 2487 struct xfs_busy_extent *busyp;
2534 struct xfs_perag *pag; 2488 struct xfs_perag *pag;
2535 struct rb_node **rbp; 2489 struct rb_node **rbp;
2536 struct rb_node *parent; 2490 struct rb_node *parent = NULL;
2537 int match;
2538
2539 2491
2540 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); 2492 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
2541 if (!new) { 2493 if (!new) {
@@ -2544,7 +2496,7 @@ xfs_alloc_busy_insert(
2544 * block, make this a synchronous transaction to insure that 2496 * block, make this a synchronous transaction to insure that
2545 * the block is not reused before this transaction commits. 2497 * the block is not reused before this transaction commits.
2546 */ 2498 */
2547 trace_xfs_alloc_busy(tp, agno, bno, len, 1); 2499 trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len);
2548 xfs_trans_set_sync(tp); 2500 xfs_trans_set_sync(tp);
2549 return; 2501 return;
2550 } 2502 }
@@ -2552,66 +2504,29 @@ xfs_alloc_busy_insert(
2552 new->agno = agno; 2504 new->agno = agno;
2553 new->bno = bno; 2505 new->bno = bno;
2554 new->length = len; 2506 new->length = len;
2555 new->tid = xfs_log_get_trans_ident(tp);
2556
2557 INIT_LIST_HEAD(&new->list); 2507 INIT_LIST_HEAD(&new->list);
2508 new->flags = flags;
2558 2509
2559 /* trace before insert to be able to see failed inserts */ 2510 /* trace before insert to be able to see failed inserts */
2560 trace_xfs_alloc_busy(tp, agno, bno, len, 0); 2511 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
2561 2512
2562 pag = xfs_perag_get(tp->t_mountp, new->agno); 2513 pag = xfs_perag_get(tp->t_mountp, new->agno);
2563restart:
2564 spin_lock(&pag->pagb_lock); 2514 spin_lock(&pag->pagb_lock);
2565 rbp = &pag->pagb_tree.rb_node; 2515 rbp = &pag->pagb_tree.rb_node;
2566 parent = NULL; 2516 while (*rbp) {
2567 busyp = NULL;
2568 match = 0;
2569 while (*rbp && match >= 0) {
2570 parent = *rbp; 2517 parent = *rbp;
2571 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); 2518 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
2572 2519
2573 if (new->bno < busyp->bno) { 2520 if (new->bno < busyp->bno) {
2574 /* may overlap, but exact start block is lower */
2575 rbp = &(*rbp)->rb_left; 2521 rbp = &(*rbp)->rb_left;
2576 if (new->bno + new->length > busyp->bno) 2522 ASSERT(new->bno + new->length <= busyp->bno);
2577 match = busyp->tid == new->tid ? 1 : -1;
2578 } else if (new->bno > busyp->bno) { 2523 } else if (new->bno > busyp->bno) {
2579 /* may overlap, but exact start block is higher */
2580 rbp = &(*rbp)->rb_right; 2524 rbp = &(*rbp)->rb_right;
2581 if (bno < busyp->bno + busyp->length) 2525 ASSERT(bno >= busyp->bno + busyp->length);
2582 match = busyp->tid == new->tid ? 1 : -1;
2583 } else { 2526 } else {
2584 match = busyp->tid == new->tid ? 1 : -1; 2527 ASSERT(0);
2585 break;
2586 } 2528 }
2587 } 2529 }
2588 if (match < 0) {
2589 /* overlap marked busy in different transaction */
2590 spin_unlock(&pag->pagb_lock);
2591 xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
2592 goto restart;
2593 }
2594 if (match > 0) {
2595 /*
2596 * overlap marked busy in same transaction. Update if exact
2597 * start block match, otherwise combine the busy extents into
2598 * a single range.
2599 */
2600 if (busyp->bno == new->bno) {
2601 busyp->length = max(busyp->length, new->length);
2602 spin_unlock(&pag->pagb_lock);
2603 ASSERT(tp->t_flags & XFS_TRANS_SYNC);
2604 xfs_perag_put(pag);
2605 kmem_free(new);
2606 return;
2607 }
2608 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2609 new->length = max(busyp->bno + busyp->length,
2610 new->bno + new->length) -
2611 min(busyp->bno, new->bno);
2612 new->bno = min(busyp->bno, new->bno);
2613 } else
2614 busyp = NULL;
2615 2530
2616 rb_link_node(&new->rb_node, parent, rbp); 2531 rb_link_node(&new->rb_node, parent, rbp);
2617 rb_insert_color(&new->rb_node, &pag->pagb_tree); 2532 rb_insert_color(&new->rb_node, &pag->pagb_tree);
@@ -2619,7 +2534,6 @@ restart:
2619 list_add(&new->list, &tp->t_busy); 2534 list_add(&new->list, &tp->t_busy);
2620 spin_unlock(&pag->pagb_lock); 2535 spin_unlock(&pag->pagb_lock);
2621 xfs_perag_put(pag); 2536 xfs_perag_put(pag);
2622 kmem_free(busyp);
2623} 2537}
2624 2538
2625/* 2539/*
@@ -2668,31 +2582,466 @@ xfs_alloc_busy_search(
2668 } 2582 }
2669 } 2583 }
2670 spin_unlock(&pag->pagb_lock); 2584 spin_unlock(&pag->pagb_lock);
2671 trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
2672 xfs_perag_put(pag); 2585 xfs_perag_put(pag);
2673 return match; 2586 return match;
2674} 2587}
2675 2588
2589/*
2590 * The found free extent [fbno, fend] overlaps part or all of the given busy
2591 * extent. If the overlap covers the beginning, the end, or all of the busy
2592 * extent, the overlapping portion can be made unbusy and used for the
2593 * allocation. We can't split a busy extent because we can't modify a
 2594 * transaction/CIL context busy list, but we can update an entry's block
2595 * number or length.
2596 *
2597 * Returns true if the extent can safely be reused, or false if the search
2598 * needs to be restarted.
2599 */
2600STATIC bool
2601xfs_alloc_busy_update_extent(
2602 struct xfs_mount *mp,
2603 struct xfs_perag *pag,
2604 struct xfs_busy_extent *busyp,
2605 xfs_agblock_t fbno,
2606 xfs_extlen_t flen,
2607 bool userdata)
2608{
2609 xfs_agblock_t fend = fbno + flen;
2610 xfs_agblock_t bbno = busyp->bno;
2611 xfs_agblock_t bend = bbno + busyp->length;
2612
2613 /*
2614 * This extent is currently being discarded. Give the thread
2615 * performing the discard a chance to mark the extent unbusy
2616 * and retry.
2617 */
2618 if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) {
2619 spin_unlock(&pag->pagb_lock);
2620 delay(1);
2621 spin_lock(&pag->pagb_lock);
2622 return false;
2623 }
2624
2625 /*
2626 * If there is a busy extent overlapping a user allocation, we have
2627 * no choice but to force the log and retry the search.
2628 *
2629 * Fortunately this does not happen during normal operation, but
2630 * only if the filesystem is very low on space and has to dip into
2631 * the AGFL for normal allocations.
2632 */
2633 if (userdata)
2634 goto out_force_log;
2635
2636 if (bbno < fbno && bend > fend) {
2637 /*
2638 * Case 1:
2639 * bbno bend
2640 * +BBBBBBBBBBBBBBBBB+
2641 * +---------+
2642 * fbno fend
2643 */
2644
2645 /*
2646 * We would have to split the busy extent to be able to track
 2647 * it correctly, which we cannot do because we would have to
2648 * modify the list of busy extents attached to the transaction
2649 * or CIL context, which is immutable.
2650 *
2651 * Force out the log to clear the busy extent and retry the
2652 * search.
2653 */
2654 goto out_force_log;
2655 } else if (bbno >= fbno && bend <= fend) {
2656 /*
2657 * Case 2:
2658 * bbno bend
2659 * +BBBBBBBBBBBBBBBBB+
2660 * +-----------------+
2661 * fbno fend
2662 *
2663 * Case 3:
2664 * bbno bend
2665 * +BBBBBBBBBBBBBBBBB+
2666 * +--------------------------+
2667 * fbno fend
2668 *
2669 * Case 4:
2670 * bbno bend
2671 * +BBBBBBBBBBBBBBBBB+
2672 * +--------------------------+
2673 * fbno fend
2674 *
2675 * Case 5:
2676 * bbno bend
2677 * +BBBBBBBBBBBBBBBBB+
2678 * +-----------------------------------+
2679 * fbno fend
2680 *
2681 */
2682
2683 /*
2684 * The busy extent is fully covered by the extent we are
2685 * allocating, and can simply be removed from the rbtree.
 2686 * However, we cannot remove it from the immutable list
2687 * tracking busy extents in the transaction or CIL context,
2688 * so set the length to zero to mark it invalid.
2689 *
2690 * We also need to restart the busy extent search from the
2691 * tree root, because erasing the node can rearrange the
2692 * tree topology.
2693 */
2694 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2695 busyp->length = 0;
2696 return false;
2697 } else if (fend < bend) {
2698 /*
2699 * Case 6:
2700 * bbno bend
2701 * +BBBBBBBBBBBBBBBBB+
2702 * +---------+
2703 * fbno fend
2704 *
2705 * Case 7:
2706 * bbno bend
2707 * +BBBBBBBBBBBBBBBBB+
2708 * +------------------+
2709 * fbno fend
2710 *
2711 */
2712 busyp->bno = fend;
2713 } else if (bbno < fbno) {
2714 /*
2715 * Case 8:
2716 * bbno bend
2717 * +BBBBBBBBBBBBBBBBB+
2718 * +-------------+
2719 * fbno fend
2720 *
2721 * Case 9:
2722 * bbno bend
2723 * +BBBBBBBBBBBBBBBBB+
2724 * +----------------------+
2725 * fbno fend
2726 */
2727 busyp->length = fbno - busyp->bno;
2728 } else {
2729 ASSERT(0);
2730 }
2731
2732 trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen);
2733 return true;
2734
2735out_force_log:
2736 spin_unlock(&pag->pagb_lock);
2737 xfs_log_force(mp, XFS_LOG_SYNC);
2738 trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen);
2739 spin_lock(&pag->pagb_lock);
2740 return false;
2741}
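The case analysis in xfs_alloc_busy_update_extent() above reduces to four outcomes once the busy extent [bbno, bend) is compared against the allocated range [fbno, fend): an interior allocation would need a split (impossible, so force the log), full coverage drops the busy extent, a front overlap advances its start, and a tail overlap shortens its length. A compact sketch of that classification — the caller is assumed to have already established that the two ranges overlap:

#include <stdio.h>

enum overlap_action {
	FORCE_LOG,	/* would need a split: case 1 */
	DROP_BUSY,	/* busy extent fully covered: cases 2-5 */
	TRIM_FRONT,	/* allocation eats the front: cases 6-7, bno = fend */
	TRIM_TAIL,	/* allocation eats the tail: cases 8-9, length = fbno - bbno */
};

static enum overlap_action classify(unsigned int bbno, unsigned int bend,
				    unsigned int fbno, unsigned int fend)
{
	if (bbno < fbno && bend > fend)
		return FORCE_LOG;
	if (bbno >= fbno && bend <= fend)
		return DROP_BUSY;
	if (fend < bend)
		return TRIM_FRONT;
	return TRIM_TAIL;	/* only bbno < fbno remains possible */
}

int main(void)
{
	/* Busy [10,50) vs allocation [20,30): strictly inside the busy range. */
	printf("%d\n", classify(10, 50, 20, 30)); /* FORCE_LOG */
	/* Busy [10,50) vs allocation [0,60): busy extent fully covered. */
	printf("%d\n", classify(10, 50, 0, 60));  /* DROP_BUSY */
	return 0;
}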
2742
2743
2744/*
2745 * For a given extent [fbno, flen], make sure we can reuse it safely.
2746 */
2676void 2747void
2677xfs_alloc_busy_clear( 2748xfs_alloc_busy_reuse(
2678 struct xfs_mount *mp, 2749 struct xfs_mount *mp,
2679 struct xfs_busy_extent *busyp) 2750 xfs_agnumber_t agno,
2751 xfs_agblock_t fbno,
2752 xfs_extlen_t flen,
2753 bool userdata)
2680{ 2754{
2681 struct xfs_perag *pag; 2755 struct xfs_perag *pag;
2756 struct rb_node *rbp;
2682 2757
2683 trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno, 2758 ASSERT(flen > 0);
2684 busyp->length);
2685 2759
2686 ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno, 2760 pag = xfs_perag_get(mp, agno);
2687 busyp->length) == 1); 2761 spin_lock(&pag->pagb_lock);
2762restart:
2763 rbp = pag->pagb_tree.rb_node;
2764 while (rbp) {
2765 struct xfs_busy_extent *busyp =
2766 rb_entry(rbp, struct xfs_busy_extent, rb_node);
2767 xfs_agblock_t bbno = busyp->bno;
2768 xfs_agblock_t bend = bbno + busyp->length;
2688 2769
2689 list_del_init(&busyp->list); 2770 if (fbno + flen <= bbno) {
2771 rbp = rbp->rb_left;
2772 continue;
2773 } else if (fbno >= bend) {
2774 rbp = rbp->rb_right;
2775 continue;
2776 }
2690 2777
2691 pag = xfs_perag_get(mp, busyp->agno); 2778 if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen,
2692 spin_lock(&pag->pagb_lock); 2779 userdata))
2693 rb_erase(&busyp->rb_node, &pag->pagb_tree); 2780 goto restart;
2781 }
2694 spin_unlock(&pag->pagb_lock); 2782 spin_unlock(&pag->pagb_lock);
2695 xfs_perag_put(pag); 2783 xfs_perag_put(pag);
2784}
2785
2786/*
2787 * For a given extent [fbno, flen], search the busy extent list to find a
2788 * subset of the extent that is not busy. If *rlen is smaller than
 2789 * args->minlen, no suitable extent could be found, and the higher level
2790 * code needs to force out the log and retry the allocation.
2791 */
2792STATIC void
2793xfs_alloc_busy_trim(
2794 struct xfs_alloc_arg *args,
2795 xfs_agblock_t bno,
2796 xfs_extlen_t len,
2797 xfs_agblock_t *rbno,
2798 xfs_extlen_t *rlen)
2799{
2800 xfs_agblock_t fbno;
2801 xfs_extlen_t flen;
2802 struct rb_node *rbp;
2803
2804 ASSERT(len > 0);
2805
2806 spin_lock(&args->pag->pagb_lock);
2807restart:
2808 fbno = bno;
2809 flen = len;
2810 rbp = args->pag->pagb_tree.rb_node;
2811 while (rbp && flen >= args->minlen) {
2812 struct xfs_busy_extent *busyp =
2813 rb_entry(rbp, struct xfs_busy_extent, rb_node);
2814 xfs_agblock_t fend = fbno + flen;
2815 xfs_agblock_t bbno = busyp->bno;
2816 xfs_agblock_t bend = bbno + busyp->length;
2817
2818 if (fend <= bbno) {
2819 rbp = rbp->rb_left;
2820 continue;
2821 } else if (fbno >= bend) {
2822 rbp = rbp->rb_right;
2823 continue;
2824 }
2825
2826 /*
2827 * If this is a metadata allocation, try to reuse the busy
2828 * extent instead of trimming the allocation.
2829 */
2830 if (!args->userdata &&
2831 !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) {
2832 if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
2833 busyp, fbno, flen,
2834 false))
2835 goto restart;
2836 continue;
2837 }
2838
2839 if (bbno <= fbno) {
2840 /* start overlap */
2841
2842 /*
2843 * Case 1:
2844 * bbno bend
2845 * +BBBBBBBBBBBBBBBBB+
2846 * +---------+
2847 * fbno fend
2848 *
2849 * Case 2:
2850 * bbno bend
2851 * +BBBBBBBBBBBBBBBBB+
2852 * +-------------+
2853 * fbno fend
2854 *
2855 * Case 3:
2856 * bbno bend
2857 * +BBBBBBBBBBBBBBBBB+
2858 * +-------------+
2859 * fbno fend
2860 *
2861 * Case 4:
2862 * bbno bend
2863 * +BBBBBBBBBBBBBBBBB+
2864 * +-----------------+
2865 * fbno fend
2866 *
2867 * No unbusy region in extent, return failure.
2868 */
2869 if (fend <= bend)
2870 goto fail;
2871
2872 /*
2873 * Case 5:
2874 * bbno bend
2875 * +BBBBBBBBBBBBBBBBB+
2876 * +----------------------+
2877 * fbno fend
2878 *
2879 * Case 6:
2880 * bbno bend
2881 * +BBBBBBBBBBBBBBBBB+
2882 * +--------------------------+
2883 * fbno fend
2884 *
2885 * Needs to be trimmed to:
2886 * +-------+
2887 * fbno fend
2888 */
2889 fbno = bend;
2890 } else if (bend >= fend) {
2891 /* end overlap */
2892
2893 /*
2894 * Case 7:
2895 * bbno bend
2896 * +BBBBBBBBBBBBBBBBB+
2897 * +------------------+
2898 * fbno fend
2899 *
2900 * Case 8:
2901 * bbno bend
2902 * +BBBBBBBBBBBBBBBBB+
2903 * +--------------------------+
2904 * fbno fend
2905 *
2906 * Needs to be trimmed to:
2907 * +-------+
2908 * fbno fend
2909 */
2910 fend = bbno;
2911 } else {
2912 /* middle overlap */
2913
2914 /*
2915 * Case 9:
2916 * bbno bend
2917 * +BBBBBBBBBBBBBBBBB+
2918 * +-----------------------------------+
2919 * fbno fend
2920 *
2921 * Can be trimmed to:
2922 * +-------+ OR +-------+
2923 * fbno fend fbno fend
2924 *
2925 * Backward allocation leads to significant
2926 * fragmentation of directories, which degrades
2927 * directory performance, therefore we always want to
2928 * choose the option that produces forward allocation
2929 * patterns.
2930 * Preferring the lower bno extent will make the next
2931 * request use "fend" as the start of the next
2932 * allocation; if the segment is no longer busy at
2933 * that point, we'll get a contiguous allocation, but
2934 * even if it is still busy, we will get a forward
2935 * allocation.
2936 * We try to avoid choosing the segment at "bend",
2937 * because that can lead to the next allocation
2938 * taking the segment at "fbno", which would be a
2939 * backward allocation. We only use the segment at
2940 * "fbno" if it is much larger than the current
2941 * requested size, because in that case there's a
2942 * good chance subsequent allocations will be
2943 * contiguous.
2944 */
2945 if (bbno - fbno >= args->maxlen) {
 2946 /* left candidate fits perfectly */
2947 fend = bbno;
2948 } else if (fend - bend >= args->maxlen * 4) {
2949 /* right candidate has enough free space */
2950 fbno = bend;
2951 } else if (bbno - fbno >= args->minlen) {
2952 /* left candidate fits minimum requirement */
2953 fend = bbno;
2954 } else {
2955 goto fail;
2956 }
2957 }
2958
2959 flen = fend - fbno;
2960 }
2961 spin_unlock(&args->pag->pagb_lock);
2962
2963 if (fbno != bno || flen != len) {
2964 trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len,
2965 fbno, flen);
2966 }
2967 *rbno = fbno;
2968 *rlen = flen;
2969 return;
2970fail:
2971 /*
 2972 * Return a zero extent length as a failure indication. All callers
2973 * re-check if the trimmed extent satisfies the minlen requirement.
2974 */
2975 spin_unlock(&args->pag->pagb_lock);
2976 trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
2977 *rbno = fbno;
2978 *rlen = 0;
2979}
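The middle-overlap branch of xfs_alloc_busy_trim() above picks between the free segments on either side of the busy range so that allocations keep moving forward through the AG, per the long comment in the patch. The selection rule in isolation, as a sketch — the busy range is assumed to sit strictly inside the free extent:

#include <stdbool.h>
#include <stdio.h>

/*
 * Busy range [bbno, bend) sits strictly inside the free extent
 * [fbno, fend). Choose the left segment [fbno, bbno) or the right
 * segment [bend, fend) with the same preferences as the patch:
 * left if it satisfies maxlen, right only if it is much larger,
 * otherwise left if it at least satisfies minlen.
 */
static bool choose_segment(unsigned int fbno, unsigned int fend,
			   unsigned int bbno, unsigned int bend,
			   unsigned int minlen, unsigned int maxlen,
			   unsigned int *rbno, unsigned int *rlen)
{
	if (bbno - fbno >= maxlen) {		/* left fits the request */
		*rbno = fbno;
		*rlen = bbno - fbno;
	} else if (fend - bend >= maxlen * 4) {	/* right has ample space */
		*rbno = bend;
		*rlen = fend - bend;
	} else if (bbno - fbno >= minlen) {	/* left meets the minimum */
		*rbno = fbno;
		*rlen = bbno - fbno;
	} else {
		return false;			/* no usable segment */
	}
	return true;
}

int main(void)
{
	unsigned int rbno, rlen;

	if (choose_segment(0, 100, 30, 40, 8, 16, &rbno, &rlen))
		printf("use [%u, %u)\n", rbno, rbno + rlen); /* left: [0, 30) */
	return 0;
}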
2980
2981static void
2982xfs_alloc_busy_clear_one(
2983 struct xfs_mount *mp,
2984 struct xfs_perag *pag,
2985 struct xfs_busy_extent *busyp)
2986{
2987 if (busyp->length) {
2988 trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno,
2989 busyp->length);
2990 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2991 }
2696 2992
2993 list_del_init(&busyp->list);
2697 kmem_free(busyp); 2994 kmem_free(busyp);
2698} 2995}
2996
2997/*
2998 * Remove all extents on the passed in list from the busy extents tree.
2999 * If do_discard is set skip extents that need to be discarded, and mark
3000 * these as undergoing a discard operation instead.
3001 */
3002void
3003xfs_alloc_busy_clear(
3004 struct xfs_mount *mp,
3005 struct list_head *list,
3006 bool do_discard)
3007{
3008 struct xfs_busy_extent *busyp, *n;
3009 struct xfs_perag *pag = NULL;
3010 xfs_agnumber_t agno = NULLAGNUMBER;
3011
3012 list_for_each_entry_safe(busyp, n, list, list) {
3013 if (busyp->agno != agno) {
3014 if (pag) {
3015 spin_unlock(&pag->pagb_lock);
3016 xfs_perag_put(pag);
3017 }
3018 pag = xfs_perag_get(mp, busyp->agno);
3019 spin_lock(&pag->pagb_lock);
3020 agno = busyp->agno;
3021 }
3022
3023 if (do_discard && busyp->length &&
3024 !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD))
3025 busyp->flags = XFS_ALLOC_BUSY_DISCARDED;
3026 else
3027 xfs_alloc_busy_clear_one(mp, pag, busyp);
3028 }
3029
3030 if (pag) {
3031 spin_unlock(&pag->pagb_lock);
3032 xfs_perag_put(pag);
3033 }
3034}
3035
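xfs_alloc_busy_clear() above walks a list that has been pre-sorted by AG, so it can hold one per-AG lock across a run of entries instead of re-taking it for every extent. The batching shape, with locks and extents reduced to stubs:

#include <stdio.h>

#define NULLAG (~0u)

struct busy { unsigned int agno; };

static void lock_ag(unsigned int agno)   { printf("lock ag %u\n", agno); }
static void unlock_ag(unsigned int agno) { printf("unlock ag %u\n", agno); }

/*
 * Walk an AG-sorted array, switching locks only when the AG changes,
 * mirroring the list_for_each_entry_safe() loop in the patch.
 */
static void clear_all(struct busy *v, int n)
{
	unsigned int agno = NULLAG;
	int i;

	for (i = 0; i < n; i++) {
		if (v[i].agno != agno) {
			if (agno != NULLAG)
				unlock_ag(agno);
			agno = v[i].agno;
			lock_ag(agno);
		}
		/* clear one busy extent here, under the AG lock */
	}
	if (agno != NULLAG)
		unlock_ag(agno);
}

int main(void)
{
	struct busy v[] = { {0}, {0}, {1}, {2}, {2} };

	clear_all(v, 5);
	return 0;
}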
3036/*
3037 * Callback for list_sort to sort busy extents by the AG they reside in.
3038 */
3039int
3040xfs_busy_extent_ag_cmp(
3041 void *priv,
3042 struct list_head *a,
3043 struct list_head *b)
3044{
3045 return container_of(a, struct xfs_busy_extent, list)->agno -
3046 container_of(b, struct xfs_busy_extent, list)->agno;
3047}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index d0b3bc72005b..2f52b924be79 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -137,14 +137,28 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
137#ifdef __KERNEL__ 137#ifdef __KERNEL__
138void 138void
139xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, 139xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
140 xfs_agblock_t bno, xfs_extlen_t len); 140 xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
141 141
142void 142void
143xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); 143xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list,
144 bool do_discard);
144 145
145int 146int
146xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, 147xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
147 xfs_agblock_t bno, xfs_extlen_t len); 148 xfs_agblock_t bno, xfs_extlen_t len);
149
150void
151xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
152 xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
153
154int
155xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
156
157static inline void xfs_alloc_busy_sort(struct list_head *list)
158{
159 list_sort(NULL, list, xfs_busy_extent_ag_cmp);
160}
161
148#endif /* __KERNEL__ */ 162#endif /* __KERNEL__ */
149 163
150/* 164/*
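The new xfs_alloc_busy_sort() inline above exists so callers can sort a transaction's busy list by AG before clearing it, which is what makes the single-lock batching in xfs_alloc_busy_clear() safe. A userspace analogue of the comparator using qsort — the struct and names are local to the example:

#include <stdio.h>
#include <stdlib.h>

struct busy { unsigned int agno; };

/* Same ordering rule as xfs_busy_extent_ag_cmp: ascending AG number. */
static int busy_ag_cmp(const void *a, const void *b)
{
	const struct busy *ba = a, *bb = b;

	return (int)ba->agno - (int)bb->agno;
}

int main(void)
{
	struct busy v[] = { {2}, {0}, {1}, {0} };
	int i;

	qsort(v, 4, sizeof(v[0]), busy_ag_cmp);
	for (i = 0; i < 4; i++)
		printf("%u ", v[i].agno); /* 0 0 1 2 */
	printf("\n");
	return 0;
}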
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 3916925e2584..2b3518826a69 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -95,6 +95,8 @@ xfs_allocbt_alloc_block(
95 return 0; 95 return 0;
96 } 96 }
97 97
98 xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
99
98 xfs_trans_agbtree_delta(cur->bc_tp, 1); 100 xfs_trans_agbtree_delta(cur->bc_tp, 1);
99 new->s = cpu_to_be32(bno); 101 new->s = cpu_to_be32(bno);
100 102
@@ -118,18 +120,8 @@ xfs_allocbt_free_block(
118 if (error) 120 if (error)
119 return error; 121 return error;
120 122
121 /* 123 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
122 * Since blocks move to the free list without the coordination used in 124 XFS_ALLOC_BUSY_SKIP_DISCARD);
123 * xfs_bmap_finish, we can't allow block to be available for
124 * reallocation and non-transaction writing (user data) until we know
125 * that the transaction that moved it to the free list is permanently
126 * on disk. We track the blocks by declaring these blocks as "busy";
127 * the busy list is maintained on a per-ag basis and each transaction
128 * records which entries should be removed when the iclog commits to
129 * disk. If a busy block is allocated, the iclog is pushed up to the
130 * LSN that freed the block.
131 */
132 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
133 xfs_trans_agbtree_delta(cur->bc_tp, -1); 125 xfs_trans_agbtree_delta(cur->bc_tp, -1);
134 return 0; 126 return 0;
135} 127}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index fa00788de2f5..e546a33214c9 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -89,36 +89,19 @@ xfs_bmap_add_attrfork_local(
89 int *flags); /* inode logging flags */ 89 int *flags); /* inode logging flags */
90 90
91/* 91/*
92 * Called by xfs_bmapi to update file extent records and the btree
93 * after allocating space (or doing a delayed allocation).
94 */
95STATIC int /* error */
96xfs_bmap_add_extent(
97 xfs_inode_t *ip, /* incore inode pointer */
98 xfs_extnum_t idx, /* extent number to update/insert */
99 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
100 xfs_bmbt_irec_t *new, /* new data to add to file extents */
101 xfs_fsblock_t *first, /* pointer to firstblock variable */
102 xfs_bmap_free_t *flist, /* list of extents to be freed */
103 int *logflagsp, /* inode logging flags */
104 int whichfork, /* data or attr fork */
105 int rsvd); /* OK to allocate reserved blocks */
106
107/*
108 * Called by xfs_bmap_add_extent to handle cases converting a delayed 92 * Called by xfs_bmap_add_extent to handle cases converting a delayed
109 * allocation to a real allocation. 93 * allocation to a real allocation.
110 */ 94 */
111STATIC int /* error */ 95STATIC int /* error */
112xfs_bmap_add_extent_delay_real( 96xfs_bmap_add_extent_delay_real(
113 xfs_inode_t *ip, /* incore inode pointer */ 97 xfs_inode_t *ip, /* incore inode pointer */
114 xfs_extnum_t idx, /* extent number to update/insert */ 98 xfs_extnum_t *idx, /* extent number to update/insert */
115 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 99 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
116 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 100 xfs_bmbt_irec_t *new, /* new data to add to file extents */
117 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ 101 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
118 xfs_fsblock_t *first, /* pointer to firstblock variable */ 102 xfs_fsblock_t *first, /* pointer to firstblock variable */
119 xfs_bmap_free_t *flist, /* list of extents to be freed */ 103 xfs_bmap_free_t *flist, /* list of extents to be freed */
120 int *logflagsp, /* inode logging flags */ 104 int *logflagsp); /* inode logging flags */
121 int rsvd); /* OK to allocate reserved blocks */
122 105
123/* 106/*
124 * Called by xfs_bmap_add_extent to handle cases converting a hole 107 * Called by xfs_bmap_add_extent to handle cases converting a hole
@@ -127,10 +110,9 @@ xfs_bmap_add_extent_delay_real(
127STATIC int /* error */ 110STATIC int /* error */
128xfs_bmap_add_extent_hole_delay( 111xfs_bmap_add_extent_hole_delay(
129 xfs_inode_t *ip, /* incore inode pointer */ 112 xfs_inode_t *ip, /* incore inode pointer */
130 xfs_extnum_t idx, /* extent number to update/insert */ 113 xfs_extnum_t *idx, /* extent number to update/insert */
131 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 114 xfs_bmbt_irec_t *new, /* new data to add to file extents */
132 int *logflagsp,/* inode logging flags */ 115 int *logflagsp); /* inode logging flags */
133 int rsvd); /* OK to allocate reserved blocks */
134 116
135/* 117/*
136 * Called by xfs_bmap_add_extent to handle cases converting a hole 118 * Called by xfs_bmap_add_extent to handle cases converting a hole
@@ -139,7 +121,7 @@ xfs_bmap_add_extent_hole_delay(
139STATIC int /* error */ 121STATIC int /* error */
140xfs_bmap_add_extent_hole_real( 122xfs_bmap_add_extent_hole_real(
141 xfs_inode_t *ip, /* incore inode pointer */ 123 xfs_inode_t *ip, /* incore inode pointer */
142 xfs_extnum_t idx, /* extent number to update/insert */ 124 xfs_extnum_t *idx, /* extent number to update/insert */
143 xfs_btree_cur_t *cur, /* if null, not a btree */ 125 xfs_btree_cur_t *cur, /* if null, not a btree */
144 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 126 xfs_bmbt_irec_t *new, /* new data to add to file extents */
145 int *logflagsp, /* inode logging flags */ 127 int *logflagsp, /* inode logging flags */
@@ -152,7 +134,7 @@ xfs_bmap_add_extent_hole_real(
152STATIC int /* error */ 134STATIC int /* error */
153xfs_bmap_add_extent_unwritten_real( 135xfs_bmap_add_extent_unwritten_real(
154 xfs_inode_t *ip, /* incore inode pointer */ 136 xfs_inode_t *ip, /* incore inode pointer */
155 xfs_extnum_t idx, /* extent number to update/insert */ 137 xfs_extnum_t *idx, /* extent number to update/insert */
156 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 138 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
157 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 139 xfs_bmbt_irec_t *new, /* new data to add to file extents */
158 int *logflagsp); /* inode logging flags */ 140 int *logflagsp); /* inode logging flags */
@@ -180,22 +162,6 @@ xfs_bmap_btree_to_extents(
 	int			whichfork); /* data or attr fork */
 
 /*
- * Called by xfs_bmapi to update file extent records and the btree
- * after removing space (or undoing a delayed allocation).
- */
-STATIC int				/* error */
-xfs_bmap_del_extent(
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_trans_t		*tp,	/* current trans pointer */
-	xfs_extnum_t		idx,	/* extent number to update/insert */
-	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
-	xfs_btree_cur_t		*cur,	/* if null, not a btree */
-	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp,/* inode logging flags */
-	int			whichfork, /* data or attr fork */
-	int			rsvd);	/* OK to allocate reserved blocks */
-
-/*
  * Remove the entry "free" from the free item list. Prev points to the
  * previous entry, unless "free" is the head of the list.
  */
@@ -474,14 +440,13 @@ xfs_bmap_add_attrfork_local(
 STATIC int				/* error */
 xfs_bmap_add_extent(
 	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
-	int			whichfork, /* data or attr fork */
-	int			rsvd)	/* OK to use reserved data blocks */
+	int			whichfork) /* data or attr fork */
 {
 	xfs_btree_cur_t		*cur;	/* btree cursor or null */
 	xfs_filblks_t		da_new; /* new count del alloc blocks used */
@@ -492,23 +457,27 @@ xfs_bmap_add_extent(
 	xfs_extnum_t		nextents; /* number of extents in file now */
 
 	XFS_STATS_INC(xs_add_exlist);
+
 	cur = *curp;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	ASSERT(idx <= nextents);
 	da_old = da_new = 0;
 	error = 0;
+
+	ASSERT(*idx >= 0);
+	ASSERT(*idx <= nextents);
+
 	/*
 	 * This is the first extent added to a new/empty file.
 	 * Special case this one, so other routines get to assume there are
 	 * already extents in the list.
 	 */
 	if (nextents == 0) {
-		xfs_iext_insert(ip, 0, 1, new,
+		xfs_iext_insert(ip, *idx, 1, new,
 			whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
 
 		ASSERT(cur == NULL);
-		ifp->if_lastex = 0;
+
 		if (!isnullstartblock(new->br_startblock)) {
 			XFS_IFORK_NEXT_SET(ip, whichfork, 1);
 			logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
@@ -522,27 +491,25 @@ xfs_bmap_add_extent(
 		if (cur)
 			ASSERT((cur->bc_private.b.flags &
 				XFS_BTCUR_BPRV_WASDEL) == 0);
-		if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
-				&logflags, rsvd)))
-			goto done;
+		error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
+						       &logflags);
 	}
 	/*
 	 * Real allocation off the end of the file.
 	 */
-	else if (idx == nextents) {
+	else if (*idx == nextents) {
 		if (cur)
 			ASSERT((cur->bc_private.b.flags &
 				XFS_BTCUR_BPRV_WASDEL) == 0);
-		if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
-				&logflags, whichfork)))
-			goto done;
+		error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
+						      &logflags, whichfork);
 	} else {
 		xfs_bmbt_irec_t	prev;	/* old extent at offset idx */
 
 		/*
 		 * Get the record referred to by idx.
 		 */
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &prev);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev);
 		/*
 		 * If it's a real allocation record, and the new allocation ends
 		 * after the start of the referred to record, then we're filling
@@ -557,22 +524,18 @@ xfs_bmap_add_extent(
 			if (cur)
 				ASSERT(cur->bc_private.b.flags &
 					XFS_BTCUR_BPRV_WASDEL);
-			if ((error = xfs_bmap_add_extent_delay_real(ip,
-					idx, &cur, new, &da_new, first, flist,
-					&logflags, rsvd)))
-				goto done;
-		} else if (new->br_state == XFS_EXT_NORM) {
-			ASSERT(new->br_state == XFS_EXT_NORM);
-			if ((error = xfs_bmap_add_extent_unwritten_real(
-				ip, idx, &cur, new, &logflags)))
-				goto done;
+			error = xfs_bmap_add_extent_delay_real(ip,
+					idx, &cur, new, &da_new,
+					first, flist, &logflags);
 		} else {
-			ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
-			if ((error = xfs_bmap_add_extent_unwritten_real(
-				ip, idx, &cur, new, &logflags)))
+			ASSERT(new->br_state == XFS_EXT_NORM ||
+			       new->br_state == XFS_EXT_UNWRITTEN);
+
+			error = xfs_bmap_add_extent_unwritten_real(ip,
+					idx, &cur, new, &logflags);
+			if (error)
 				goto done;
 		}
-		ASSERT(*curp == cur || *curp == NULL);
 	}
 	/*
 	 * Otherwise we're filling in a hole with an allocation.
@@ -581,13 +544,15 @@ xfs_bmap_add_extent(
 		if (cur)
 			ASSERT((cur->bc_private.b.flags &
 				XFS_BTCUR_BPRV_WASDEL) == 0);
-		if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
-				new, &logflags, whichfork)))
-			goto done;
+		error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
+				new, &logflags, whichfork);
 		}
 	}
 
+	if (error)
+		goto done;
 	ASSERT(*curp == cur || *curp == NULL);
+
 	/*
 	 * Convert to a btree if necessary.
 	 */
@@ -615,7 +580,7 @@ xfs_bmap_add_extent(
 		ASSERT(nblks <= da_old);
 		if (nblks < da_old)
 			xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
-				(int64_t)(da_old - nblks), rsvd);
+				(int64_t)(da_old - nblks), 0);
 	}
 	/*
 	 * Clear out the allocated field, done with it now in any case.
@@ -640,14 +605,13 @@ done:
 STATIC int				/* error */
 xfs_bmap_add_extent_delay_real(
 	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	xfs_filblks_t		*dnew,	/* new delayed-alloc indirect blocks */
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
-	int			*logflagsp, /* inode logging flags */
-	int			rsvd)	/* OK to use reserved data block allocation */
+	int			*logflagsp) /* inode logging flags */
 {
 	xfs_btree_cur_t		*cur;	/* btree cursor */
 	int			diff;	/* temp value */
@@ -673,7 +637,7 @@ xfs_bmap_add_extent_delay_real(
 	 */
 	cur = *curp;
 	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-	ep = xfs_iext_get_ext(ifp, idx);
+	ep = xfs_iext_get_ext(ifp, *idx);
 	xfs_bmbt_get_all(ep, &PREV);
 	new_endoff = new->br_startoff + new->br_blockcount;
 	ASSERT(PREV.br_startoff <= new->br_startoff);
@@ -692,9 +656,9 @@ xfs_bmap_add_extent_delay_real(
 	 * Check and set flags if this segment has a left neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
 	 */
-	if (idx > 0) {
+	if (*idx > 0) {
 		state |= BMAP_LEFT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
 
 		if (isnullstartblock(LEFT.br_startblock))
 			state |= BMAP_LEFT_DELAY;
@@ -712,9 +676,9 @@ xfs_bmap_add_extent_delay_real(
 	 * Don't set contiguous if the combined extent would be too large.
 	 * Also check for all-three-contiguous being too large.
 	 */
-	if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
 		state |= BMAP_RIGHT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
 
 		if (isnullstartblock(RIGHT.br_startblock))
 			state |= BMAP_RIGHT_DELAY;
@@ -745,14 +709,14 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in all of a previously delayed allocation extent.
 		 * The left and right neighbors are both contiguous with new.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+		--*idx;
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
 			LEFT.br_blockcount + PREV.br_blockcount +
 			RIGHT.br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		xfs_iext_remove(ip, idx, 2, state);
-		ip->i_df.if_lastex = idx - 1;
+		xfs_iext_remove(ip, *idx + 1, 2, state);
 		ip->i_d.di_nextents--;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -784,13 +748,14 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in all of a previously delayed allocation extent.
 		 * The left neighbor is contiguous, the right is not.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+		--*idx;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
 			LEFT.br_blockcount + PREV.br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		ip->i_df.if_lastex = idx - 1;
-		xfs_iext_remove(ip, idx, 1, state);
+		xfs_iext_remove(ip, *idx + 1, 1, state);
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -814,14 +779,13 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in all of a previously delayed allocation extent.
 		 * The right neighbor is contiguous, the left is not.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount + RIGHT.br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		ip->i_df.if_lastex = idx;
-		xfs_iext_remove(ip, idx + 1, 1, state);
+		xfs_iext_remove(ip, *idx + 1, 1, state);
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -837,6 +801,7 @@ xfs_bmap_add_extent_delay_real(
 					RIGHT.br_blockcount, PREV.br_state)))
 				goto done;
 		}
+
 		*dnew = 0;
 		break;
 
@@ -846,11 +811,10 @@ xfs_bmap_add_extent_delay_real(
 		 * Neither the left nor right neighbors are contiguous with
 		 * the new one.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		ip->i_df.if_lastex = idx;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -866,6 +830,7 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
+
 		*dnew = 0;
 		break;
 
@@ -874,17 +839,16 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in the first part of a previous delayed allocation.
 		 * The left neighbor is contiguous.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+		trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
 			LEFT.br_blockcount + new->br_blockcount);
 		xfs_bmbt_set_startoff(ep,
 			PREV.br_startoff + new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
 
 		temp = PREV.br_blockcount - new->br_blockcount;
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
-		ip->i_df.if_lastex = idx - 1;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -904,7 +868,9 @@ xfs_bmap_add_extent_delay_real(
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 			startblockval(PREV.br_startblock));
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		--*idx;
 		*dnew = temp;
 		break;
 
@@ -913,12 +879,11 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in the first part of a previous delayed allocation.
 		 * The left neighbor is not contiguous.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_startoff(ep, new_endoff);
 		temp = PREV.br_blockcount - new->br_blockcount;
 		xfs_bmbt_set_blockcount(ep, temp);
-		xfs_iext_insert(ip, idx, 1, new, state);
-		ip->i_df.if_lastex = idx;
+		xfs_iext_insert(ip, *idx, 1, new, state);
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -946,9 +911,10 @@ xfs_bmap_add_extent_delay_real(
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 			startblockval(PREV.br_startblock) -
 			(cur ? cur->bc_private.b.allocated : 0));
-		ep = xfs_iext_get_ext(ifp, idx + 1);
+		ep = xfs_iext_get_ext(ifp, *idx + 1);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
+
 		*dnew = temp;
 		break;
 
@@ -958,15 +924,13 @@ xfs_bmap_add_extent_delay_real(
 		 * The right neighbor is contiguous with the new allocation.
 		 */
 		temp = PREV.br_blockcount - new->br_blockcount;
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
-		trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
-		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1),
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1),
 			new->br_startoff, new->br_startblock,
 			new->br_blockcount + RIGHT.br_blockcount,
 			RIGHT.br_state);
-		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
-		ip->i_df.if_lastex = idx + 1;
+		trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -983,10 +947,14 @@ xfs_bmap_add_extent_delay_real(
 					RIGHT.br_state)))
 				goto done;
 		}
+
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 			startblockval(PREV.br_startblock));
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		++*idx;
 		*dnew = temp;
 		break;
 
@@ -996,10 +964,9 @@ xfs_bmap_add_extent_delay_real(
 		 * The right neighbor is not contiguous.
 		 */
 		temp = PREV.br_blockcount - new->br_blockcount;
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
-		xfs_iext_insert(ip, idx + 1, 1, new, state);
-		ip->i_df.if_lastex = idx + 1;
+		xfs_iext_insert(ip, *idx + 1, 1, new, state);
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1027,9 +994,11 @@ xfs_bmap_add_extent_delay_real(
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 			startblockval(PREV.br_startblock) -
 			(cur ? cur->bc_private.b.allocated : 0));
-		ep = xfs_iext_get_ext(ifp, idx);
+		ep = xfs_iext_get_ext(ifp, *idx);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		++*idx;
 		*dnew = temp;
 		break;
 
@@ -1056,7 +1025,7 @@ xfs_bmap_add_extent_delay_real(
 		 */
 		temp = new->br_startoff - PREV.br_startoff;
 		temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
-		trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);	/* truncate PREV */
 		LEFT = *new;
 		RIGHT.br_state = PREV.br_state;
@@ -1065,8 +1034,7 @@ xfs_bmap_add_extent_delay_real(
 		RIGHT.br_startoff = new_endoff;
 		RIGHT.br_blockcount = temp2;
 		/* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
-		xfs_iext_insert(ip, idx + 1, 2, &LEFT, state);
-		ip->i_df.if_lastex = idx + 1;
+		xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state);
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1097,7 +1065,7 @@ xfs_bmap_add_extent_delay_real(
 			(cur ? cur->bc_private.b.allocated : 0));
 		if (diff > 0 &&
 		    xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
-					     -((int64_t)diff), rsvd)) {
+					     -((int64_t)diff), 0)) {
 			/*
 			 * Ick gross gag me with a spoon.
 			 */
@@ -1109,7 +1077,7 @@ xfs_bmap_add_extent_delay_real(
 				if (!diff ||
 				    !xfs_icsb_modify_counters(ip->i_mount,
 					    XFS_SBS_FDBLOCKS,
-					    -((int64_t)diff), rsvd))
+					    -((int64_t)diff), 0))
 					break;
 			}
 			if (temp2) {
@@ -1118,18 +1086,20 @@ xfs_bmap_add_extent_delay_real(
 				if (!diff ||
 				    !xfs_icsb_modify_counters(ip->i_mount,
 					    XFS_SBS_FDBLOCKS,
-					    -((int64_t)diff), rsvd))
+					    -((int64_t)diff), 0))
 					break;
 				}
 			}
 		}
-		ep = xfs_iext_get_ext(ifp, idx);
+		ep = xfs_iext_get_ext(ifp, *idx);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
-		trace_xfs_bmap_pre_update(ip, idx + 2, state, _THIS_IP_);
-		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2),
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_);
+		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2),
 			nullstartblock((int)temp2));
-		trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_);
+
+		++*idx;
 		*dnew = temp + temp2;
 		break;
 
@@ -1161,7 +1131,7 @@ done:
 STATIC int				/* error */
 xfs_bmap_add_extent_unwritten_real(
 	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp) /* inode logging flags */
@@ -1188,7 +1158,7 @@ xfs_bmap_add_extent_unwritten_real(
 	error = 0;
 	cur = *curp;
 	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-	ep = xfs_iext_get_ext(ifp, idx);
+	ep = xfs_iext_get_ext(ifp, *idx);
 	xfs_bmbt_get_all(ep, &PREV);
 	newext = new->br_state;
 	oldext = (newext == XFS_EXT_UNWRITTEN) ?
@@ -1211,9 +1181,9 @@ xfs_bmap_add_extent_unwritten_real(
 	 * Check and set flags if this segment has a left neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
 	 */
-	if (idx > 0) {
+	if (*idx > 0) {
 		state |= BMAP_LEFT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
 
 		if (isnullstartblock(LEFT.br_startblock))
 			state |= BMAP_LEFT_DELAY;
@@ -1231,9 +1201,9 @@ xfs_bmap_add_extent_unwritten_real(
 	 * Don't set contiguous if the combined extent would be too large.
 	 * Also check for all-three-contiguous being too large.
 	 */
-	if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
 		state |= BMAP_RIGHT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
 		if (isnullstartblock(RIGHT.br_startblock))
 			state |= BMAP_RIGHT_DELAY;
 	}
@@ -1262,14 +1232,15 @@ xfs_bmap_add_extent_unwritten_real(
 		 * Setting all of a previous oldext extent to newext.
 		 * The left and right neighbors are both contiguous with new.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+		--*idx;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
 			LEFT.br_blockcount + PREV.br_blockcount +
 			RIGHT.br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		xfs_iext_remove(ip, idx, 2, state);
-		ip->i_df.if_lastex = idx - 1;
+		xfs_iext_remove(ip, *idx + 1, 2, state);
 		ip->i_d.di_nextents -= 2;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1305,13 +1276,14 @@ xfs_bmap_add_extent_unwritten_real(
 		 * Setting all of a previous oldext extent to newext.
 		 * The left neighbor is contiguous, the right is not.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+		--*idx;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
 			LEFT.br_blockcount + PREV.br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		ip->i_df.if_lastex = idx - 1;
-		xfs_iext_remove(ip, idx, 1, state);
+		xfs_iext_remove(ip, *idx + 1, 1, state);
 		ip->i_d.di_nextents--;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1341,13 +1313,12 @@ xfs_bmap_add_extent_unwritten_real(
 		 * Setting all of a previous oldext extent to newext.
 		 * The right neighbor is contiguous, the left is not.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount + RIGHT.br_blockcount);
 		xfs_bmbt_set_state(ep, newext);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
-		ip->i_df.if_lastex = idx;
-		xfs_iext_remove(ip, idx + 1, 1, state);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		xfs_iext_remove(ip, *idx + 1, 1, state);
 		ip->i_d.di_nextents--;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1378,11 +1349,10 @@ xfs_bmap_add_extent_unwritten_real(
 		 * Neither the left nor right neighbors are contiguous with
 		 * the new one.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_state(ep, newext);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		ip->i_df.if_lastex = idx;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -1404,21 +1374,22 @@ xfs_bmap_add_extent_unwritten_real(
 		 * Setting the first part of a previous oldext extent to newext.
 		 * The left neighbor is contiguous.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+		trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
 			LEFT.br_blockcount + new->br_blockcount);
 		xfs_bmbt_set_startoff(ep,
 			PREV.br_startoff + new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
 
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep,
 			new->br_startblock + new->br_blockcount);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		--*idx;
 
-		ip->i_df.if_lastex = idx - 1;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -1449,17 +1420,16 @@ xfs_bmap_add_extent_unwritten_real(
 		 * Setting the first part of a previous oldext extent to newext.
 		 * The left neighbor is not contiguous.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
 		xfs_bmbt_set_startoff(ep, new_endoff);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
 		xfs_bmbt_set_startblock(ep,
 			new->br_startblock + new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		xfs_iext_insert(ip, idx, 1, new, state);
-		ip->i_df.if_lastex = idx;
+		xfs_iext_insert(ip, *idx, 1, new, state);
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1488,17 +1458,19 @@ xfs_bmap_add_extent_unwritten_real(
 		 * Setting the last part of a previous oldext extent to newext.
 		 * The right neighbor is contiguous with the new allocation.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
-		trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
-		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1),
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		++*idx;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
 			new->br_startoff, new->br_startblock,
 			new->br_blockcount + RIGHT.br_blockcount, newext);
-		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		ip->i_df.if_lastex = idx + 1;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -1528,13 +1500,14 @@ xfs_bmap_add_extent_unwritten_real(
 		 * Setting the last part of a previous oldext extent to newext.
 		 * The right neighbor is not contiguous.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		++*idx;
+		xfs_iext_insert(ip, *idx, 1, new, state);
 
-		xfs_iext_insert(ip, idx + 1, 1, new, state);
-		ip->i_df.if_lastex = idx + 1;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1568,10 +1541,10 @@ xfs_bmap_add_extent_unwritten_real(
 		 * newext. Contiguity is impossible here.
 		 * One extent becomes three extents.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			new->br_startoff - PREV.br_startoff);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
 		r[0] = *new;
 		r[1].br_startoff = new_endoff;
@@ -1579,8 +1552,10 @@ xfs_bmap_add_extent_unwritten_real(
 			PREV.br_startoff + PREV.br_blockcount - new_endoff;
 		r[1].br_startblock = new->br_startblock + new->br_blockcount;
 		r[1].br_state = oldext;
-		xfs_iext_insert(ip, idx + 1, 2, &r[0], state);
-		ip->i_df.if_lastex = idx + 1;
+
+		++*idx;
+		xfs_iext_insert(ip, *idx, 2, &r[0], state);
+
 		ip->i_d.di_nextents += 2;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1650,12 +1625,10 @@ done:
 STATIC int				/* error */
 xfs_bmap_add_extent_hole_delay(
 	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp, /* inode logging flags */
-	int			rsvd)	/* OK to allocate reserved blocks */
+	int			*logflagsp) /* inode logging flags */
 {
-	xfs_bmbt_rec_host_t	*ep;	/* extent record for idx */
 	xfs_ifork_t		*ifp;	/* inode fork pointer */
 	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
 	xfs_filblks_t		newlen=0;	/* new indirect size */
@@ -1665,16 +1638,15 @@ xfs_bmap_add_extent_hole_delay(
 	xfs_filblks_t		temp=0;	/* temp for indirect calculations */
 
 	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-	ep = xfs_iext_get_ext(ifp, idx);
 	state = 0;
 	ASSERT(isnullstartblock(new->br_startblock));
 
 	/*
 	 * Check and set flags if this segment has a left neighbor
 	 */
-	if (idx > 0) {
+	if (*idx > 0) {
 		state |= BMAP_LEFT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
 
 		if (isnullstartblock(left.br_startblock))
 			state |= BMAP_LEFT_DELAY;
@@ -1684,9 +1656,9 @@ xfs_bmap_add_extent_hole_delay(
 	 * Check and set flags if the current (right) segment exists.
 	 * If it doesn't exist, we're converting the hole at end-of-file.
 	 */
-	if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
 		state |= BMAP_RIGHT_VALID;
-		xfs_bmbt_get_all(ep, &right);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
 
 		if (isnullstartblock(right.br_startblock))
 			state |= BMAP_RIGHT_DELAY;
@@ -1719,21 +1691,21 @@ xfs_bmap_add_extent_hole_delay(
 		 * on the left and on the right.
 		 * Merge all three into a single extent record.
 		 */
+		--*idx;
 		temp = left.br_blockcount + new->br_blockcount +
 			right.br_blockcount;
 
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
 		oldlen = startblockval(left.br_startblock) +
 			startblockval(new->br_startblock) +
 			startblockval(right.br_startblock);
 		newlen = xfs_bmap_worst_indlen(ip, temp);
-		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
+		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
 			nullstartblock((int)newlen));
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		xfs_iext_remove(ip, idx, 1, state);
-		ip->i_df.if_lastex = idx - 1;
+		xfs_iext_remove(ip, *idx + 1, 1, state);
 		break;
 
 	case BMAP_LEFT_CONTIG:
@@ -1742,17 +1714,17 @@ xfs_bmap_add_extent_hole_delay(
 		 * on the left.
 		 * Merge the new allocation with the left neighbor.
 		 */
+		--*idx;
 		temp = left.br_blockcount + new->br_blockcount;
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
 		oldlen = startblockval(left.br_startblock) +
 			startblockval(new->br_startblock);
 		newlen = xfs_bmap_worst_indlen(ip, temp);
-		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
+		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
 			nullstartblock((int)newlen));
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
-
-		ip->i_df.if_lastex = idx - 1;
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 		break;
 
 	case BMAP_RIGHT_CONTIG:
@@ -1761,16 +1733,15 @@ xfs_bmap_add_extent_hole_delay(
 		 * on the right.
 		 * Merge the new allocation with the right neighbor.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		temp = new->br_blockcount + right.br_blockcount;
 		oldlen = startblockval(new->br_startblock) +
 			startblockval(right.br_startblock);
 		newlen = xfs_bmap_worst_indlen(ip, temp);
-		xfs_bmbt_set_allf(ep, new->br_startoff,
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+			new->br_startoff,
 			nullstartblock((int)newlen), temp, right.br_state);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
-
-		ip->i_df.if_lastex = idx;
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 		break;
 
 	case 0:
@@ -1780,14 +1751,13 @@ xfs_bmap_add_extent_hole_delay(
 		 * Insert a new entry.
 		 */
 		oldlen = newlen = 0;
-		xfs_iext_insert(ip, idx, 1, new, state);
-		ip->i_df.if_lastex = idx;
+		xfs_iext_insert(ip, *idx, 1, new, state);
 		break;
 	}
 	if (oldlen != newlen) {
 		ASSERT(oldlen > newlen);
 		xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
-			(int64_t)(oldlen - newlen), rsvd);
+			(int64_t)(oldlen - newlen), 0);
 		/*
 		 * Nothing to do for disk quota accounting here.
 		 */
@@ -1803,13 +1773,12 @@ xfs_bmap_add_extent_hole_delay(
 STATIC int				/* error */
 xfs_bmap_add_extent_hole_real(
 	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
 	int			whichfork) /* data or attr fork */
 {
-	xfs_bmbt_rec_host_t	*ep;	/* pointer to extent entry ins. point */
 	int			error;	/* error return value */
 	int			i;	/* temp state */
 	xfs_ifork_t		*ifp;	/* inode fork pointer */
@@ -1819,8 +1788,7 @@ xfs_bmap_add_extent_hole_real(
 	int			state;	/* state bits, accessed thru macros */
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
-	ep = xfs_iext_get_ext(ifp, idx);
+	ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
 	state = 0;
 
 	if (whichfork == XFS_ATTR_FORK)
@@ -1829,9 +1797,9 @@ xfs_bmap_add_extent_hole_real(
 	/*
 	 * Check and set flags if this segment has a left neighbor.
 	 */
-	if (idx > 0) {
+	if (*idx > 0) {
 		state |= BMAP_LEFT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
 		if (isnullstartblock(left.br_startblock))
 			state |= BMAP_LEFT_DELAY;
 	}
@@ -1840,9 +1808,9 @@ xfs_bmap_add_extent_hole_real(
 	 * Check and set flags if this segment has a current value.
 	 * Not true if we're inserting into the "hole" at eof.
 	 */
-	if (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+	if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
 		state |= BMAP_RIGHT_VALID;
-		xfs_bmbt_get_all(ep, &right);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
 		if (isnullstartblock(right.br_startblock))
 			state |= BMAP_RIGHT_DELAY;
 	}
@@ -1879,14 +1847,15 @@ xfs_bmap_add_extent_hole_real(
 		 * left and on the right.
 		 * Merge all three into a single extent record.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+		--*idx;
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
 			left.br_blockcount + new->br_blockcount +
 			right.br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, *idx + 1, 1, state);
 
-		xfs_iext_remove(ip, idx, 1, state);
-		ifp->if_lastex = idx - 1;
 		XFS_IFORK_NEXT_SET(ip, whichfork,
 			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
 		if (cur == NULL) {
@@ -1921,12 +1890,12 @@ xfs_bmap_add_extent_hole_real(
 		 * on the left.
 		 * Merge the new allocation with the left neighbor.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+		--*idx;
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
 			left.br_blockcount + new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		ifp->if_lastex = idx - 1;
 		if (cur == NULL) {
 			rval = xfs_ilog_fext(whichfork);
 		} else {
@@ -1952,13 +1921,13 @@ xfs_bmap_add_extent_hole_real(
 		 * on the right.
 		 * Merge the new allocation with the right neighbor.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
-		xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock,
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+			new->br_startoff, new->br_startblock,
 			new->br_blockcount + right.br_blockcount,
 			right.br_state);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		ifp->if_lastex = idx;
 		if (cur == NULL) {
 			rval = xfs_ilog_fext(whichfork);
 		} else {
@@ -1984,8 +1953,7 @@ xfs_bmap_add_extent_hole_real(
 		 * real allocation.
 		 * Insert a new entry.
 		 */
-		xfs_iext_insert(ip, idx, 1, new, state);
-		ifp->if_lastex = idx;
+		xfs_iext_insert(ip, *idx, 1, new, state);
 		XFS_IFORK_NEXT_SET(ip, whichfork,
 			XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
 		if (cur == NULL) {
@@ -2833,13 +2801,12 @@ STATIC int /* error */
 xfs_bmap_del_extent(
 	xfs_inode_t		*ip,	/* incore inode pointer */
 	xfs_trans_t		*tp,	/* current transaction pointer */
-	xfs_extnum_t		idx,	/* extent number to update/delete */
+	xfs_extnum_t		*idx,	/* extent number to update/delete */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*del,	/* data to remove from extents */
 	int			*logflagsp, /* inode logging flags */
-	int			whichfork, /* data or attr fork */
-	int			rsvd)	/* OK to allocate reserved blocks */
+	int			whichfork) /* data or attr fork */
 {
 	xfs_filblks_t		da_new;	/* new delay-alloc indirect blocks */
 	xfs_filblks_t		da_old;	/* old delay-alloc indirect blocks */
@@ -2870,10 +2837,10 @@ xfs_bmap_del_extent(
 
 	mp = ip->i_mount;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT((idx >= 0) && (idx < ifp->if_bytes /
+	ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
 		(uint)sizeof(xfs_bmbt_rec_t)));
 	ASSERT(del->br_blockcount > 0);
-	ep = xfs_iext_get_ext(ifp, idx);
+	ep = xfs_iext_get_ext(ifp, *idx);
 	xfs_bmbt_get_all(ep, &got);
 	ASSERT(got.br_startoff <= del->br_startoff);
 	del_endoff = del->br_startoff + del->br_blockcount;
@@ -2947,11 +2914,12 @@ xfs_bmap_del_extent(
 		/*
 		 * Matches the whole extent. Delete the entry.
 		 */
-		xfs_iext_remove(ip, idx, 1,
+		xfs_iext_remove(ip, *idx, 1,
 			whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
-		ifp->if_lastex = idx;
+		--*idx;
 		if (delay)
 			break;
+
 		XFS_IFORK_NEXT_SET(ip, whichfork,
 			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
 		flags |= XFS_ILOG_CORE;
@@ -2968,21 +2936,20 @@ xfs_bmap_del_extent(
 		/*
 		 * Deleting the first part of the extent.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_startoff(ep, del_endoff);
 		temp = got.br_blockcount - del->br_blockcount;
 		xfs_bmbt_set_blockcount(ep, temp);
-		ifp->if_lastex = idx;
 		if (delay) {
 			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 				da_old);
 			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-			trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+			trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 			da_new = temp;
 			break;
 		}
 		xfs_bmbt_set_startblock(ep, del_endblock);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 		if (!cur) {
 			flags |= xfs_ilog_fext(whichfork);
 			break;
@@ -2998,18 +2965,17 @@ xfs_bmap_del_extent(
2998 * Deleting the last part of the extent. 2965 * Deleting the last part of the extent.
2999 */ 2966 */
3000 temp = got.br_blockcount - del->br_blockcount; 2967 temp = got.br_blockcount - del->br_blockcount;
3001 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 2968 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
3002 xfs_bmbt_set_blockcount(ep, temp); 2969 xfs_bmbt_set_blockcount(ep, temp);
3003 ifp->if_lastex = idx;
3004 if (delay) { 2970 if (delay) {
3005 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 2971 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
3006 da_old); 2972 da_old);
3007 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 2973 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
3008 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2974 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
3009 da_new = temp; 2975 da_new = temp;
3010 break; 2976 break;
3011 } 2977 }
3012 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2978 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
3013 if (!cur) { 2979 if (!cur) {
3014 flags |= xfs_ilog_fext(whichfork); 2980 flags |= xfs_ilog_fext(whichfork);
3015 break; 2981 break;
@@ -3026,7 +2992,7 @@ xfs_bmap_del_extent(
3026 * Deleting the middle of the extent. 2992 * Deleting the middle of the extent.
3027 */ 2993 */
3028 temp = del->br_startoff - got.br_startoff; 2994 temp = del->br_startoff - got.br_startoff;
3029 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 2995 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
3030 xfs_bmbt_set_blockcount(ep, temp); 2996 xfs_bmbt_set_blockcount(ep, temp);
3031 new.br_startoff = del_endoff; 2997 new.br_startoff = del_endoff;
3032 temp2 = got_endoff - del_endoff; 2998 temp2 = got_endoff - del_endoff;
@@ -3113,9 +3079,9 @@ xfs_bmap_del_extent(
3113 } 3079 }
3114 } 3080 }
3115 } 3081 }
3116 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 3082 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
3117 xfs_iext_insert(ip, idx + 1, 1, &new, state); 3083 xfs_iext_insert(ip, *idx + 1, 1, &new, state);
3118 ifp->if_lastex = idx + 1; 3084 ++*idx;
3119 break; 3085 break;
3120 } 3086 }
3121 /* 3087 /*
@@ -3142,7 +3108,7 @@ xfs_bmap_del_extent(
3142 ASSERT(da_old >= da_new); 3108 ASSERT(da_old >= da_new);
3143 if (da_old > da_new) { 3109 if (da_old > da_new) {
3144 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 3110 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
3145 (int64_t)(da_old - da_new), rsvd); 3111 (int64_t)(da_old - da_new), 0);
3146 } 3112 }
3147done: 3113done:
3148 *logflagsp = flags; 3114 *logflagsp = flags;
@@ -4562,29 +4528,24 @@ xfs_bmapi(
4562 if (rt) { 4528 if (rt) {
4563 error = xfs_mod_incore_sb(mp, 4529 error = xfs_mod_incore_sb(mp,
4564 XFS_SBS_FREXTENTS, 4530 XFS_SBS_FREXTENTS,
4565 -((int64_t)extsz), (flags & 4531 -((int64_t)extsz), 0);
4566 XFS_BMAPI_RSVBLOCKS));
4567 } else { 4532 } else {
4568 error = xfs_icsb_modify_counters(mp, 4533 error = xfs_icsb_modify_counters(mp,
4569 XFS_SBS_FDBLOCKS, 4534 XFS_SBS_FDBLOCKS,
4570 -((int64_t)alen), (flags & 4535 -((int64_t)alen), 0);
4571 XFS_BMAPI_RSVBLOCKS));
4572 } 4536 }
4573 if (!error) { 4537 if (!error) {
4574 error = xfs_icsb_modify_counters(mp, 4538 error = xfs_icsb_modify_counters(mp,
4575 XFS_SBS_FDBLOCKS, 4539 XFS_SBS_FDBLOCKS,
4576 -((int64_t)indlen), (flags & 4540 -((int64_t)indlen), 0);
4577 XFS_BMAPI_RSVBLOCKS));
4578 if (error && rt) 4541 if (error && rt)
4579 xfs_mod_incore_sb(mp, 4542 xfs_mod_incore_sb(mp,
4580 XFS_SBS_FREXTENTS, 4543 XFS_SBS_FREXTENTS,
4581 (int64_t)extsz, (flags & 4544 (int64_t)extsz, 0);
4582 XFS_BMAPI_RSVBLOCKS));
4583 else if (error) 4545 else if (error)
4584 xfs_icsb_modify_counters(mp, 4546 xfs_icsb_modify_counters(mp,
4585 XFS_SBS_FDBLOCKS, 4547 XFS_SBS_FDBLOCKS,
4586 (int64_t)alen, (flags & 4548 (int64_t)alen, 0);
4587 XFS_BMAPI_RSVBLOCKS));
4588 } 4549 }
4589 4550
4590 if (error) { 4551 if (error) {
@@ -4701,13 +4662,12 @@ xfs_bmapi(
4701 if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) 4662 if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
4702 got.br_state = XFS_EXT_UNWRITTEN; 4663 got.br_state = XFS_EXT_UNWRITTEN;
4703 } 4664 }
4704 error = xfs_bmap_add_extent(ip, lastx, &cur, &got, 4665 error = xfs_bmap_add_extent(ip, &lastx, &cur, &got,
4705 firstblock, flist, &tmp_logflags, 4666 firstblock, flist, &tmp_logflags,
4706 whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); 4667 whichfork);
4707 logflags |= tmp_logflags; 4668 logflags |= tmp_logflags;
4708 if (error) 4669 if (error)
4709 goto error0; 4670 goto error0;
4710 lastx = ifp->if_lastex;
4711 ep = xfs_iext_get_ext(ifp, lastx); 4671 ep = xfs_iext_get_ext(ifp, lastx);
4712 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4672 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4713 xfs_bmbt_get_all(ep, &got); 4673 xfs_bmbt_get_all(ep, &got);
@@ -4803,13 +4763,12 @@ xfs_bmapi(
4803 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) 4763 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
4804 ? XFS_EXT_NORM 4764 ? XFS_EXT_NORM
4805 : XFS_EXT_UNWRITTEN; 4765 : XFS_EXT_UNWRITTEN;
4806 error = xfs_bmap_add_extent(ip, lastx, &cur, mval, 4766 error = xfs_bmap_add_extent(ip, &lastx, &cur, mval,
4807 firstblock, flist, &tmp_logflags, 4767 firstblock, flist, &tmp_logflags,
4808 whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); 4768 whichfork);
4809 logflags |= tmp_logflags; 4769 logflags |= tmp_logflags;
4810 if (error) 4770 if (error)
4811 goto error0; 4771 goto error0;
4812 lastx = ifp->if_lastex;
4813 ep = xfs_iext_get_ext(ifp, lastx); 4772 ep = xfs_iext_get_ext(ifp, lastx);
4814 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4773 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4815 xfs_bmbt_get_all(ep, &got); 4774 xfs_bmbt_get_all(ep, &got);
@@ -4868,14 +4827,14 @@ xfs_bmapi(
4868 /* 4827 /*
4869 * Else go on to the next record. 4828 * Else go on to the next record.
4870 */ 4829 */
4871 ep = xfs_iext_get_ext(ifp, ++lastx);
4872 prev = got; 4830 prev = got;
4873 if (lastx >= nextents) 4831 if (++lastx < nextents) {
4874 eof = 1; 4832 ep = xfs_iext_get_ext(ifp, lastx);
4875 else
4876 xfs_bmbt_get_all(ep, &got); 4833 xfs_bmbt_get_all(ep, &got);
4834 } else {
4835 eof = 1;
4836 }
4877 } 4837 }
4878 ifp->if_lastex = lastx;
4879 *nmap = n; 4838 *nmap = n;
4880 /* 4839 /*
4881 * Transform from btree to extents, give it cur. 4840 * Transform from btree to extents, give it cur.
@@ -4984,7 +4943,6 @@ xfs_bmapi_single(
4984 ASSERT(!isnullstartblock(got.br_startblock)); 4943 ASSERT(!isnullstartblock(got.br_startblock));
4985 ASSERT(bno < got.br_startoff + got.br_blockcount); 4944 ASSERT(bno < got.br_startoff + got.br_blockcount);
4986 *fsb = got.br_startblock + (bno - got.br_startoff); 4945 *fsb = got.br_startblock + (bno - got.br_startoff);
4987 ifp->if_lastex = lastx;
4988 return 0; 4946 return 0;
4989} 4947}
4990 4948
@@ -5026,7 +4984,6 @@ xfs_bunmapi(
5026 int tmp_logflags; /* partial logging flags */ 4984 int tmp_logflags; /* partial logging flags */
5027 int wasdel; /* was a delayed alloc extent */ 4985 int wasdel; /* was a delayed alloc extent */
5028 int whichfork; /* data or attribute fork */ 4986 int whichfork; /* data or attribute fork */
5029 int rsvd; /* OK to allocate reserved blocks */
5030 xfs_fsblock_t sum; 4987 xfs_fsblock_t sum;
5031 4988
5032 trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); 4989 trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
@@ -5044,7 +5001,7 @@ xfs_bunmapi(
5044 mp = ip->i_mount; 5001 mp = ip->i_mount;
5045 if (XFS_FORCED_SHUTDOWN(mp)) 5002 if (XFS_FORCED_SHUTDOWN(mp))
5046 return XFS_ERROR(EIO); 5003 return XFS_ERROR(EIO);
5047 rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0; 5004
5048 ASSERT(len > 0); 5005 ASSERT(len > 0);
5049 ASSERT(nexts >= 0); 5006 ASSERT(nexts >= 0);
5050 ASSERT(ifp->if_ext_max == 5007 ASSERT(ifp->if_ext_max ==
@@ -5160,9 +5117,9 @@ xfs_bunmapi(
5160 del.br_blockcount = mod; 5117 del.br_blockcount = mod;
5161 } 5118 }
5162 del.br_state = XFS_EXT_UNWRITTEN; 5119 del.br_state = XFS_EXT_UNWRITTEN;
5163 error = xfs_bmap_add_extent(ip, lastx, &cur, &del, 5120 error = xfs_bmap_add_extent(ip, &lastx, &cur, &del,
5164 firstblock, flist, &logflags, 5121 firstblock, flist, &logflags,
5165 XFS_DATA_FORK, 0); 5122 XFS_DATA_FORK);
5166 if (error) 5123 if (error)
5167 goto error0; 5124 goto error0;
5168 goto nodelete; 5125 goto nodelete;
@@ -5188,9 +5145,12 @@ xfs_bunmapi(
5188 */ 5145 */
5189 ASSERT(bno >= del.br_blockcount); 5146 ASSERT(bno >= del.br_blockcount);
5190 bno -= del.br_blockcount; 5147 bno -= del.br_blockcount;
5191 if (bno < got.br_startoff) { 5148 if (got.br_startoff > bno) {
5192 if (--lastx >= 0) 5149 if (--lastx >= 0) {
5193 xfs_bmbt_get_all(--ep, &got); 5150 ep = xfs_iext_get_ext(ifp,
5151 lastx);
5152 xfs_bmbt_get_all(ep, &got);
5153 }
5194 } 5154 }
5195 continue; 5155 continue;
5196 } else if (del.br_state == XFS_EXT_UNWRITTEN) { 5156 } else if (del.br_state == XFS_EXT_UNWRITTEN) {
@@ -5214,18 +5174,19 @@ xfs_bunmapi(
5214 prev.br_startoff = start; 5174 prev.br_startoff = start;
5215 } 5175 }
5216 prev.br_state = XFS_EXT_UNWRITTEN; 5176 prev.br_state = XFS_EXT_UNWRITTEN;
5217 error = xfs_bmap_add_extent(ip, lastx - 1, &cur, 5177 lastx--;
5178 error = xfs_bmap_add_extent(ip, &lastx, &cur,
5218 &prev, firstblock, flist, &logflags, 5179 &prev, firstblock, flist, &logflags,
5219 XFS_DATA_FORK, 0); 5180 XFS_DATA_FORK);
5220 if (error) 5181 if (error)
5221 goto error0; 5182 goto error0;
5222 goto nodelete; 5183 goto nodelete;
5223 } else { 5184 } else {
5224 ASSERT(del.br_state == XFS_EXT_NORM); 5185 ASSERT(del.br_state == XFS_EXT_NORM);
5225 del.br_state = XFS_EXT_UNWRITTEN; 5186 del.br_state = XFS_EXT_UNWRITTEN;
5226 error = xfs_bmap_add_extent(ip, lastx, &cur, 5187 error = xfs_bmap_add_extent(ip, &lastx, &cur,
5227 &del, firstblock, flist, &logflags, 5188 &del, firstblock, flist, &logflags,
5228 XFS_DATA_FORK, 0); 5189 XFS_DATA_FORK);
5229 if (error) 5190 if (error)
5230 goto error0; 5191 goto error0;
5231 goto nodelete; 5192 goto nodelete;
@@ -5240,13 +5201,13 @@ xfs_bunmapi(
5240 rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); 5201 rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
5241 do_div(rtexts, mp->m_sb.sb_rextsize); 5202 do_div(rtexts, mp->m_sb.sb_rextsize);
5242 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, 5203 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
5243 (int64_t)rtexts, rsvd); 5204 (int64_t)rtexts, 0);
5244 (void)xfs_trans_reserve_quota_nblks(NULL, 5205 (void)xfs_trans_reserve_quota_nblks(NULL,
5245 ip, -((long)del.br_blockcount), 0, 5206 ip, -((long)del.br_blockcount), 0,
5246 XFS_QMOPT_RES_RTBLKS); 5207 XFS_QMOPT_RES_RTBLKS);
5247 } else { 5208 } else {
5248 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 5209 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
5249 (int64_t)del.br_blockcount, rsvd); 5210 (int64_t)del.br_blockcount, 0);
5250 (void)xfs_trans_reserve_quota_nblks(NULL, 5211 (void)xfs_trans_reserve_quota_nblks(NULL,
5251 ip, -((long)del.br_blockcount), 0, 5212 ip, -((long)del.br_blockcount), 0,
5252 XFS_QMOPT_RES_REGBLKS); 5213 XFS_QMOPT_RES_REGBLKS);
@@ -5277,31 +5238,29 @@ xfs_bunmapi(
5277 error = XFS_ERROR(ENOSPC); 5238 error = XFS_ERROR(ENOSPC);
5278 goto error0; 5239 goto error0;
5279 } 5240 }
5280 error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del, 5241 error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
5281 &tmp_logflags, whichfork, rsvd); 5242 &tmp_logflags, whichfork);
5282 logflags |= tmp_logflags; 5243 logflags |= tmp_logflags;
5283 if (error) 5244 if (error)
5284 goto error0; 5245 goto error0;
5285 bno = del.br_startoff - 1; 5246 bno = del.br_startoff - 1;
5286nodelete: 5247nodelete:
5287 lastx = ifp->if_lastex;
5288 /* 5248 /*
5289 * If not done go on to the next (previous) record. 5249 * If not done go on to the next (previous) record.
5290 * Reset ep in case the extents array was re-alloced.
5291 */ 5250 */
5292 ep = xfs_iext_get_ext(ifp, lastx);
5293 if (bno != (xfs_fileoff_t)-1 && bno >= start) { 5251 if (bno != (xfs_fileoff_t)-1 && bno >= start) {
5294 if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) || 5252 if (lastx >= 0) {
5295 xfs_bmbt_get_startoff(ep) > bno) { 5253 ep = xfs_iext_get_ext(ifp, lastx);
5296 if (--lastx >= 0) 5254 if (xfs_bmbt_get_startoff(ep) > bno) {
5297 ep = xfs_iext_get_ext(ifp, lastx); 5255 if (--lastx >= 0)
5298 } 5256 ep = xfs_iext_get_ext(ifp,
5299 if (lastx >= 0) 5257 lastx);
5258 }
5300 xfs_bmbt_get_all(ep, &got); 5259 xfs_bmbt_get_all(ep, &got);
5260 }
5301 extno++; 5261 extno++;
5302 } 5262 }
5303 } 5263 }
5304 ifp->if_lastex = lastx;
5305 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; 5264 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
5306 ASSERT(ifp->if_ext_max == 5265 ASSERT(ifp->if_ext_max ==
5307 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); 5266 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
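The xfs_bmap.c hunks above all follow one pattern: the cached per-fork cursor ifp->if_lastex is gone, and the current extent index now travels through the call chain by pointer (xfs_extnum_t *idx), so a helper that inserts or removes records can move the caller's cursor itself (++*idx, --*idx) instead of writing it back into the fork. That is also why the "lastx = ifp->if_lastex" reloads in xfs_bmapi() and xfs_bunmapi() disappear: the index is already current when the callee returns. A minimal user-space sketch of the convention, with plain ints and invented names standing in for the XFS types:

    /*
     * Sketch only: a caller-owned cursor passed by pointer, modelled on
     * the new xfs_bmap_del_extent(ip, tp, &lastx, ...) signature.
     */
    #include <assert.h>
    #include <stdio.h>

    static int extents[8] = { 10, 20, 30, 40 };
    static int nextents = 4;

    /* Hypothetical stand-in for deleting a whole record: shift the
     * array left, then back the caller's cursor up, as "--*idx" does
     * in the patched xfs_bmap_del_extent(). */
    static void del_extent(int *idx)
    {
        int i;

        assert(*idx >= 0 && *idx < nextents);
        for (i = *idx; i < nextents - 1; i++)
            extents[i] = extents[i + 1];
        nextents--;
        --*idx;
    }

    int main(void)
    {
        int lastx = 2;  /* caller-owned cursor, like lastx in xfs_bmapi() */

        del_extent(&lastx);
        printf("cursor now %d, next record %d\n", lastx, extents[lastx + 1]);
        return 0;
    }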
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 3651191daea1..c62234bde053 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -69,7 +69,6 @@ typedef struct xfs_bmap_free
69#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ 69#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */
70#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ 70#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */
71#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ 71#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */
72#define XFS_BMAPI_RSVBLOCKS 0x020 /* OK to alloc. reserved data blocks */
73#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ 72#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */
74#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ 73#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
75 /* combine contig. space */ 74 /* combine contig. space */
@@ -87,7 +86,6 @@ typedef struct xfs_bmap_free
87 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ 86 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
88 { XFS_BMAPI_METADATA, "METADATA" }, \ 87 { XFS_BMAPI_METADATA, "METADATA" }, \
89 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ 88 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
90 { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \
91 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ 89 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \
92 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ 90 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \
93 { XFS_BMAPI_CONTIG, "CONTIG" }, \ 91 { XFS_BMAPI_CONTIG, "CONTIG" }, \
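xfs_bmap.h drops XFS_BMAPI_RSVBLOCKS and its trace-flag string because nothing sets the flag any more; as the xfs_bmap.c hunks show, every xfs_mod_incore_sb()/xfs_icsb_modify_counters() call that used to forward it now passes a literal 0 for the rsvd argument. A loose model of what that argument gates (simplified counters, not the kernel's actual accounting):

    #include <stdio.h>

    static long fdblocks = 4;   /* free-block counter */
    static long resblocks = 64; /* reserved pool */

    /* Loose model of a counter update taking an rsvd argument: with
     * rsvd == 0, a decrement that would go negative fails with ENOSPC
     * instead of borrowing from the reserved pool. */
    static int mod_fdblocks(long delta, int rsvd)
    {
        if (fdblocks + delta >= 0) {
            fdblocks += delta;
            return 0;
        }
        if (rsvd && resblocks + fdblocks + delta >= 0) {
            resblocks += fdblocks + delta;
            fdblocks = 0;
            return 0;
        }
        return -1;  /* ENOSPC */
    }

    int main(void)
    {
        printf("rsvd=0 -> %d\n", mod_fdblocks(-8, 0)); /* refused */
        printf("rsvd=1 -> %d\n", mod_fdblocks(-8, 1)); /* borrows */
        return 0;
    }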
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index be628677c288..9a84a85c03b1 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -202,7 +202,7 @@ xfs_swap_extents(
202 xfs_inode_t *tip, /* tmp inode */ 202 xfs_inode_t *tip, /* tmp inode */
203 xfs_swapext_t *sxp) 203 xfs_swapext_t *sxp)
204{ 204{
205 xfs_mount_t *mp; 205 xfs_mount_t *mp = ip->i_mount;
206 xfs_trans_t *tp; 206 xfs_trans_t *tp;
207 xfs_bstat_t *sbp = &sxp->sx_stat; 207 xfs_bstat_t *sbp = &sxp->sx_stat;
208 xfs_ifork_t *tempifp, *ifp, *tifp; 208 xfs_ifork_t *tempifp, *ifp, *tifp;
@@ -212,16 +212,12 @@ xfs_swap_extents(
212 int taforkblks = 0; 212 int taforkblks = 0;
213 __uint64_t tmp; 213 __uint64_t tmp;
214 214
215 mp = ip->i_mount;
216
217 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); 215 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
218 if (!tempifp) { 216 if (!tempifp) {
219 error = XFS_ERROR(ENOMEM); 217 error = XFS_ERROR(ENOMEM);
220 goto out; 218 goto out;
221 } 219 }
222 220
223 sbp = &sxp->sx_stat;
224
225 /* 221 /*
226 * we have to do two separate lock calls here to keep lockdep 222 * we have to do two separate lock calls here to keep lockdep
227 * happy. If we try to get all the locks in one call, lock will 223 * happy. If we try to get all the locks in one call, lock will
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a37480a6e023..a098a20ca63e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -920,7 +920,6 @@ xfs_iread_extents(
920 /* 920 /*
921 * We know that the size is valid (it's checked in iformat_btree) 921 * We know that the size is valid (it's checked in iformat_btree)
922 */ 922 */
923 ifp->if_lastex = NULLEXTNUM;
924 ifp->if_bytes = ifp->if_real_bytes = 0; 923 ifp->if_bytes = ifp->if_real_bytes = 0;
925 ifp->if_flags |= XFS_IFEXTENTS; 924 ifp->if_flags |= XFS_IFEXTENTS;
926 xfs_iext_add(ifp, 0, nextents); 925 xfs_iext_add(ifp, 0, nextents);
@@ -1354,7 +1353,7 @@ xfs_itruncate_start(
1354 return 0; 1353 return 0;
1355 } 1354 }
1356 last_byte = xfs_file_last_byte(ip); 1355 last_byte = xfs_file_last_byte(ip);
1357 trace_xfs_itruncate_start(ip, flags, new_size, toss_start, last_byte); 1356 trace_xfs_itruncate_start(ip, new_size, flags, toss_start, last_byte);
1358 if (last_byte > toss_start) { 1357 if (last_byte > toss_start) {
1359 if (flags & XFS_ITRUNC_DEFINITE) { 1358 if (flags & XFS_ITRUNC_DEFINITE) {
1360 xfs_tosspages(ip, toss_start, 1359 xfs_tosspages(ip, toss_start,
@@ -1470,7 +1469,7 @@ xfs_itruncate_finish(
1470 * file but the log buffers containing the free and reallocation 1469 * file but the log buffers containing the free and reallocation
1471 * don't, then we'd end up with garbage in the blocks being freed. 1470 * don't, then we'd end up with garbage in the blocks being freed.
1472 * As long as we make the new_size permanent before actually 1471 * As long as we make the new_size permanent before actually
1473 * freeing any blocks it doesn't matter if they get writtten to. 1472 * freeing any blocks it doesn't matter if they get written to.
1474 * 1473 *
1475 * The callers must signal into us whether or not the size 1474 * The callers must signal into us whether or not the size
1476 * setting here must be synchronous. There are a few cases 1475 * setting here must be synchronous. There are a few cases
@@ -2558,12 +2557,9 @@ xfs_iflush_fork(
2558 case XFS_DINODE_FMT_EXTENTS: 2557 case XFS_DINODE_FMT_EXTENTS:
2559 ASSERT((ifp->if_flags & XFS_IFEXTENTS) || 2558 ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
2560 !(iip->ili_format.ilf_fields & extflag[whichfork])); 2559 !(iip->ili_format.ilf_fields & extflag[whichfork]));
2561 ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) ||
2562 (ifp->if_bytes == 0));
2563 ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) ||
2564 (ifp->if_bytes > 0));
2565 if ((iip->ili_format.ilf_fields & extflag[whichfork]) && 2560 if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
2566 (ifp->if_bytes > 0)) { 2561 (ifp->if_bytes > 0)) {
2562 ASSERT(xfs_iext_get_ext(ifp, 0));
2567 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); 2563 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
2568 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, 2564 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
2569 whichfork); 2565 whichfork);
@@ -3112,6 +3108,8 @@ xfs_iext_get_ext(
3112 xfs_extnum_t idx) /* index of target extent */ 3108 xfs_extnum_t idx) /* index of target extent */
3113{ 3109{
3114 ASSERT(idx >= 0); 3110 ASSERT(idx >= 0);
3111 ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3112
3115 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { 3113 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
3116 return ifp->if_u1.if_ext_irec->er_extbuf; 3114 return ifp->if_u1.if_ext_irec->er_extbuf;
3117 } else if (ifp->if_flags & XFS_IFEXTIREC) { 3115 } else if (ifp->if_flags & XFS_IFEXTIREC) {
@@ -3191,7 +3189,6 @@ xfs_iext_add(
3191 } 3189 }
3192 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 3190 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3193 ifp->if_real_bytes = 0; 3191 ifp->if_real_bytes = 0;
3194 ifp->if_lastex = nextents + ext_diff;
3195 } 3192 }
3196 /* 3193 /*
3197 * Otherwise use a linear (direct) extent list. 3194 * Otherwise use a linear (direct) extent list.
@@ -3886,8 +3883,10 @@ xfs_iext_idx_to_irec(
3886 xfs_extnum_t page_idx = *idxp; /* extent index in target list */ 3883 xfs_extnum_t page_idx = *idxp; /* extent index in target list */
3887 3884
3888 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3885 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3889 ASSERT(page_idx >= 0 && page_idx <= 3886 ASSERT(page_idx >= 0);
3890 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); 3887 ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3888 ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
3889
3891 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3890 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3892 erp_idx = 0; 3891 erp_idx = 0;
3893 low = 0; 3892 low = 0;
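In xfs_inode.c, dropping if_lastex also removes the stale-index escape hatch, so the extent lookup helpers can check their arguments harder: xfs_iext_get_ext() now asserts the index lies inside the in-use record range, and xfs_iext_idx_to_irec() allows an index one past the end only when the caller is growing the list (realloc). Roughly, in plain C:

    #include <assert.h>
    #include <stddef.h>

    struct rec { unsigned long long l0, l1; }; /* stand-in for xfs_bmbt_rec_t */

    /* Sketch of the tightened bounds rule in xfs_iext_get_ext(). */
    static struct rec *get_ext(struct rec *base, size_t bytes, long idx)
    {
        assert(idx >= 0);
        assert(idx < (long)(bytes / sizeof(struct rec)));
        return &base[idx];
    }

    int main(void)
    {
        struct rec recs[4] = { { 0, 0 } };

        get_ext(recs, sizeof(recs), 3); /* last in-use record: fine */
        /* get_ext(recs, sizeof(recs), 4) would trip the assert; only
         * the realloc path in xfs_iext_idx_to_irec() may address one
         * past the end. */
        return 0;
    }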
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ff4e2a30227d..3ae6d58e5473 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -67,7 +67,6 @@ typedef struct xfs_ifork {
67 short if_broot_bytes; /* bytes allocated for root */ 67 short if_broot_bytes; /* bytes allocated for root */
68 unsigned char if_flags; /* per-fork flags */ 68 unsigned char if_flags; /* per-fork flags */
69 unsigned char if_ext_max; /* max # of extent records */ 69 unsigned char if_ext_max; /* max # of extent records */
70 xfs_extnum_t if_lastex; /* last if_extents used */
71 union { 70 union {
72 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ 71 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
73 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ 72 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 576fdfe81d60..09983a3344a5 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -970,7 +970,6 @@ xfs_iflush_abort(
970{ 970{
971 xfs_inode_log_item_t *iip = ip->i_itemp; 971 xfs_inode_log_item_t *iip = ip->i_itemp;
972 972
973 iip = ip->i_itemp;
974 if (iip) { 973 if (iip) {
975 struct xfs_ail *ailp = iip->ili_item.li_ailp; 974 struct xfs_ail *ailp = iip->ili_item.li_ailp;
976 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 975 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b612ce4520ae..211930246f20 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1449,6 +1449,13 @@ xlog_dealloc_log(xlog_t *log)
1449 1449
1450 xlog_cil_destroy(log); 1450 xlog_cil_destroy(log);
1451 1451
1452 /*
1453 * always need to ensure that the extra buffer does not point to memory
1454 * owned by another log buffer before we free it.
1455 */
1456 xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size);
1457 xfs_buf_free(log->l_xbuf);
1458
1452 iclog = log->l_iclog; 1459 iclog = log->l_iclog;
1453 for (i=0; i<log->l_iclog_bufs; i++) { 1460 for (i=0; i<log->l_iclog_bufs; i++) {
1454 xfs_buf_free(iclog->ic_bp); 1461 xfs_buf_free(iclog->ic_bp);
@@ -1458,7 +1465,6 @@ xlog_dealloc_log(xlog_t *log)
1458 } 1465 }
1459 spinlock_destroy(&log->l_icloglock); 1466 spinlock_destroy(&log->l_icloglock);
1460 1467
1461 xfs_buf_free(log->l_xbuf);
1462 log->l_mp->m_log = NULL; 1468 log->l_mp->m_log = NULL;
1463 kmem_free(log); 1469 kmem_free(log);
1464} /* xlog_dealloc_log */ 1470} /* xlog_dealloc_log */
@@ -3248,13 +3254,6 @@ xfs_log_ticket_get(
3248 return ticket; 3254 return ticket;
3249} 3255}
3250 3256
3251xlog_tid_t
3252xfs_log_get_trans_ident(
3253 struct xfs_trans *tp)
3254{
3255 return tp->t_ticket->t_tid;
3256}
3257
3258/* 3257/*
3259 * Allocate and initialise a new log ticket. 3258 * Allocate and initialise a new log ticket.
3260 */ 3259 */
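Two independent xfs_log.c changes here: the unused xfs_log_get_trans_ident() helper is deleted, and xlog_dealloc_log() now empties l_xbuf before freeing it. The latter closes an aliasing hazard: during a split log write l_xbuf is temporarily pointed at memory owned by an iclog buffer, so tearing it down without resetting the pointer could free memory that still belongs to another buffer. A user-space outline of the hazard (not the xfs_buf API):

    #include <stdlib.h>

    struct buf {
        void   *mem;
        size_t  len;
        int     owns_mem;
    };

    /* Stand-in for xfs_buf_set_empty(): forget any borrowed pointer
     * so teardown cannot free memory the buffer does not own. */
    static void buf_set_empty(struct buf *bp, size_t len)
    {
        bp->mem = NULL;
        bp->len = len;
        bp->owns_mem = 0;
    }

    static void buf_free(struct buf *bp)
    {
        if (bp->owns_mem)
            free(bp->mem);
        free(bp);
    }

    int main(void)
    {
        struct buf *iclog_bp = malloc(sizeof(*iclog_bp));
        struct buf *xbuf = malloc(sizeof(*xbuf));

        iclog_bp->mem = malloc(4096);
        iclog_bp->len = 4096;
        iclog_bp->owns_mem = 1;

        *xbuf = *iclog_bp;         /* xbuf now aliases the iclog's memory */

        buf_set_empty(xbuf, 4096); /* without this: double free below */
        buf_free(xbuf);
        buf_free(iclog_bp);
        return 0;
    }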
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 3bd3291ef8d2..78c9039994af 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -189,8 +189,6 @@ void xlog_iodone(struct xfs_buf *);
189struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); 189struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
190void xfs_log_ticket_put(struct xlog_ticket *ticket); 190void xfs_log_ticket_put(struct xlog_ticket *ticket);
191 191
192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
193
194void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, 192void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
195 struct xfs_log_vec *log_vector, 193 struct xfs_log_vec *log_vector,
196 xfs_lsn_t *commit_lsn, int flags); 194 xfs_lsn_t *commit_lsn, int flags);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 9ca59be08977..c7755d5a5fbe 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -29,6 +29,7 @@
29#include "xfs_mount.h" 29#include "xfs_mount.h"
30#include "xfs_error.h" 30#include "xfs_error.h"
31#include "xfs_alloc.h" 31#include "xfs_alloc.h"
32#include "xfs_discard.h"
32 33
33/* 34/*
34 * Perform initial CIL structure initialisation. If the CIL is not 35 * Perform initial CIL structure initialisation. If the CIL is not
@@ -361,19 +362,28 @@ xlog_cil_committed(
361 int abort) 362 int abort)
362{ 363{
363 struct xfs_cil_ctx *ctx = args; 364 struct xfs_cil_ctx *ctx = args;
364 struct xfs_busy_extent *busyp, *n; 365 struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
365 366
366 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, 367 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
367 ctx->start_lsn, abort); 368 ctx->start_lsn, abort);
368 369
369 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) 370 xfs_alloc_busy_sort(&ctx->busy_extents);
370 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); 371 xfs_alloc_busy_clear(mp, &ctx->busy_extents,
372 (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
371 373
372 spin_lock(&ctx->cil->xc_cil_lock); 374 spin_lock(&ctx->cil->xc_cil_lock);
373 list_del(&ctx->committing); 375 list_del(&ctx->committing);
374 spin_unlock(&ctx->cil->xc_cil_lock); 376 spin_unlock(&ctx->cil->xc_cil_lock);
375 377
376 xlog_cil_free_logvec(ctx->lv_chain); 378 xlog_cil_free_logvec(ctx->lv_chain);
379
380 if (!list_empty(&ctx->busy_extents)) {
381 ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
382
383 xfs_discard_extents(mp, &ctx->busy_extents);
384 xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
385 }
386
377 kmem_free(ctx); 387 kmem_free(ctx);
378} 388}
379 389
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 5864850e9e34..2d3b6a498d63 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -146,6 +146,8 @@ static inline uint xlog_get_client_id(__be32 i)
146 shutdown */ 146 shutdown */
147#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ 147#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
148 148
149typedef __uint32_t xlog_tid_t;
150
149#ifdef __KERNEL__ 151#ifdef __KERNEL__
150/* 152/*
151 * Below are states for covering allocation transactions. 153 * Below are states for covering allocation transactions.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 5cc464a17c93..04142caedb2b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -205,6 +205,35 @@ xlog_bread(
205} 205}
206 206
207/* 207/*
208 * Read at an offset into the buffer. Returns with the buffer in its original
209 * state regardless of the result of the read.
210 */
211STATIC int
212xlog_bread_offset(
213 xlog_t *log,
214 xfs_daddr_t blk_no, /* block to read from */
215 int nbblks, /* blocks to read */
216 xfs_buf_t *bp,
217 xfs_caddr_t offset)
218{
219 xfs_caddr_t orig_offset = XFS_BUF_PTR(bp);
220 int orig_len = bp->b_buffer_length;
221 int error, error2;
222
223 error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks));
224 if (error)
225 return error;
226
227 error = xlog_bread_noalign(log, blk_no, nbblks, bp);
228
229 /* must reset buffer pointer even on error */
230 error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len);
231 if (error)
232 return error;
233 return error2;
234}
235
236/*
208 * Write out the buffer at the given block for the given number of blocks. 237 * Write out the buffer at the given block for the given number of blocks.
209 * The buffer is kept locked across the write and is returned locked. 238 * The buffer is kept locked across the write and is returned locked.
210 * This can only be used for synchronous log writes. 239 * This can only be used for synchronous log writes.
@@ -1229,20 +1258,12 @@ xlog_write_log_records(
1229 */ 1258 */
1230 ealign = round_down(end_block, sectbb); 1259 ealign = round_down(end_block, sectbb);
1231 if (j == 0 && (start_block + endcount > ealign)) { 1260 if (j == 0 && (start_block + endcount > ealign)) {
1232 offset = XFS_BUF_PTR(bp); 1261 offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block);
1233 balign = BBTOB(ealign - start_block); 1262 error = xlog_bread_offset(log, ealign, sectbb,
1234 error = XFS_BUF_SET_PTR(bp, offset + balign, 1263 bp, offset);
1235 BBTOB(sectbb));
1236 if (error) 1264 if (error)
1237 break; 1265 break;
1238 1266
1239 error = xlog_bread_noalign(log, ealign, sectbb, bp);
1240 if (error)
1241 break;
1242
1243 error = XFS_BUF_SET_PTR(bp, offset, bufblks);
1244 if (error)
1245 break;
1246 } 1267 }
1247 1268
1248 offset = xlog_align(log, start_block, endcount, bp); 1269 offset = xlog_align(log, start_block, endcount, bp);
@@ -3448,19 +3469,9 @@ xlog_do_recovery_pass(
3448 * - order is important. 3469 * - order is important.
3449 */ 3470 */
3450 wrapped_hblks = hblks - split_hblks; 3471 wrapped_hblks = hblks - split_hblks;
3451 error = XFS_BUF_SET_PTR(hbp, 3472 error = xlog_bread_offset(log, 0,
3452 offset + BBTOB(split_hblks), 3473 wrapped_hblks, hbp,
3453 BBTOB(hblks - split_hblks)); 3474 offset + BBTOB(split_hblks));
3454 if (error)
3455 goto bread_err2;
3456
3457 error = xlog_bread_noalign(log, 0,
3458 wrapped_hblks, hbp);
3459 if (error)
3460 goto bread_err2;
3461
3462 error = XFS_BUF_SET_PTR(hbp, offset,
3463 BBTOB(hblks));
3464 if (error) 3475 if (error)
3465 goto bread_err2; 3476 goto bread_err2;
3466 } 3477 }
@@ -3511,19 +3522,9 @@ xlog_do_recovery_pass(
3511 * _first_, then the log start (LR header end) 3522 * _first_, then the log start (LR header end)
3512 * - order is important. 3523 * - order is important.
3513 */ 3524 */
3514 error = XFS_BUF_SET_PTR(dbp, 3525 error = xlog_bread_offset(log, wrapped_hblks,
3515 offset + BBTOB(split_bblks), 3526 bblks - split_bblks, dbp,
3516 BBTOB(bblks - split_bblks)); 3527 offset + BBTOB(split_bblks));
3517 if (error)
3518 goto bread_err2;
3519
3520 error = xlog_bread_noalign(log, wrapped_hblks,
3521 bblks - split_bblks,
3522 dbp);
3523 if (error)
3524 goto bread_err2;
3525
3526 error = XFS_BUF_SET_PTR(dbp, offset, h_size);
3527 if (error) 3528 if (error)
3528 goto bread_err2; 3529 goto bread_err2;
3529 } 3530 }
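The log-recovery hunks replace three copies of the same set-pointer / read / restore-pointer dance with the new xlog_bread_offset() helper, which restores the buffer's original pointer and length even when the read fails, so the error paths can no longer leak a redirected buffer. The pattern, as a self-contained sketch:

    #include <string.h>

    struct buf { char *ptr; int len; };

    /* Stand-in for xlog_bread_noalign(): pretend to fill the buffer. */
    static int bread(struct buf *bp, int blk, int nblks)
    {
        memset(bp->ptr, 0, (size_t)bp->len);
        return 0;
    }

    /* Sketch of xlog_bread_offset(): redirect the buffer into the middle
     * of its own mapping, read, then restore the original pointer and
     * length whether or not the read succeeded. */
    static int bread_offset(struct buf *bp, int blk, int nblks,
                            char *offset, int nbytes)
    {
        char *orig_ptr = bp->ptr;
        int   orig_len = bp->len;
        int   error;

        bp->ptr = offset;
        bp->len = nbytes;
        error = bread(bp, blk, nblks);
        bp->ptr = orig_ptr; /* must reset even on error */
        bp->len = orig_len;
        return error;
    }

    int main(void)
    {
        char mem[1024];
        struct buf b = { mem, sizeof(mem) };

        return bread_offset(&b, 0, 1, mem + 512, 512);
    }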
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bb3f9a7b24ed..b49b82363d20 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1900,7 +1900,7 @@ xfs_mod_incore_sb_batch(
1900 uint nmsb, 1900 uint nmsb,
1901 int rsvd) 1901 int rsvd)
1902{ 1902{
1903 xfs_mod_sb_t *msbp = &msb[0]; 1903 xfs_mod_sb_t *msbp;
1904 int error = 0; 1904 int error = 0;
1905 1905
1906 /* 1906 /*
@@ -1910,7 +1910,7 @@ xfs_mod_incore_sb_batch(
1910 * changes will be atomic. 1910 * changes will be atomic.
1911 */ 1911 */
1912 spin_lock(&mp->m_sb_lock); 1912 spin_lock(&mp->m_sb_lock);
1913 for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) { 1913 for (msbp = msb; msbp < (msb + nmsb); msbp++) {
1914 ASSERT(msbp->msb_field < XFS_SBS_ICOUNT || 1914 ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
1915 msbp->msb_field > XFS_SBS_FDBLOCKS); 1915 msbp->msb_field > XFS_SBS_FDBLOCKS);
1916 1916
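The xfs_mod_incore_sb_batch() change is purely for readability: the old loop re-initialised its cursor from itself ("for (msbp = &msbp[0]; ...)"), which only happened to work because of the initialiser at the declaration; initialising from the array in the loop header makes the iteration self-explanatory:

    #include <stdio.h>

    int main(void)
    {
        int msb[4] = { 1, 2, 3, 4 };
        int *msbp, sum = 0;

        /* new form: the cursor is initialised from the array, inside
         * the loop header, rather than via a self-referential
         * "msbp = &msbp[0]" that depended on the declaration above */
        for (msbp = msb; msbp < msb + 4; msbp++)
            sum += *msbp;
        printf("sum = %d\n", sum);
        return 0;
    }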
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 19af0ab0d0c6..3d68bb267c5f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -224,6 +224,7 @@ typedef struct xfs_mount {
224#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 224#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
225 operations, typically for 225 operations, typically for
226 disk errors in metadata */ 226 disk errors in metadata */
227#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
227#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to 228#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to
228 user */ 229 user */
229#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment 230#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 76922793f64f..7c7bc2b786bd 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -608,10 +608,8 @@ STATIC void
608xfs_trans_free( 608xfs_trans_free(
609 struct xfs_trans *tp) 609 struct xfs_trans *tp)
610{ 610{
611 struct xfs_busy_extent *busyp, *n; 611 xfs_alloc_busy_sort(&tp->t_busy);
612 612 xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false);
613 list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
614 xfs_alloc_busy_clear(tp->t_mountp, busyp);
615 613
616 atomic_dec(&tp->t_mountp->m_active_trans); 614 atomic_dec(&tp->t_mountp->m_active_trans);
617 xfs_trans_free_dqinfo(tp); 615 xfs_trans_free_dqinfo(tp);
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 26d1867d8156..65584b55607d 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -73,8 +73,6 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */
73typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ 73typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
74typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ 74typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
75 75
76typedef __uint32_t xlog_tid_t; /* transaction ID type */
77
78/* 76/*
79 * These types are 64 bits on disk but are either 32 or 64 bits in memory. 77 * These types are 64 bits on disk but are either 32 or 64 bits in memory.
80 * Disk based types: 78 * Disk based types: