Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/vfs_super.c | 2
-rw-r--r--  fs/aio.c | 2
-rw-r--r--  fs/attr.c | 2
-rw-r--r--  fs/bad_inode.c | 2
-rw-r--r--  fs/binfmt_elf.c | 30
-rw-r--r--  fs/binfmt_flat.c | 2
-rw-r--r--  fs/binfmt_misc.c | 3
-rw-r--r--  fs/bio.c | 2
-rw-r--r--  fs/block_dev.c | 3
-rw-r--r--  fs/buffer.c | 2
-rw-r--r--  fs/ceph/inode.c | 11
-rw-r--r--  fs/ceph/mds_client.c | 7
-rw-r--r--  fs/ceph/snap.c | 2
-rw-r--r--  fs/ceph/super.c | 19
-rw-r--r--  fs/ceph/super.h | 4
-rw-r--r--  fs/ceph/xattr.c | 202
-rw-r--r--  fs/cifs/README | 6
-rw-r--r--  fs/cifs/cifs_debug.c | 3
-rw-r--r--  fs/cifs/cifsfs.c | 25
-rw-r--r--  fs/cifs/cifsglob.h | 47
-rw-r--r--  fs/cifs/cifsproto.h | 9
-rw-r--r--  fs/cifs/cifssmb.c | 18
-rw-r--r--  fs/cifs/connect.c | 44
-rw-r--r--  fs/cifs/dir.c | 6
-rw-r--r--  fs/cifs/file.c | 23
-rw-r--r--  fs/cifs/misc.c | 19
-rw-r--r--  fs/cifs/transport.c | 78
-rw-r--r--  fs/compat.c | 1
-rw-r--r--  fs/compat_ioctl.c | 1
-rw-r--r--  fs/dcache.c | 5
-rw-r--r--  fs/dcookies.c | 2
-rw-r--r--  fs/eventfd.c | 2
-rw-r--r--  fs/eventpoll.c | 45
-rw-r--r--  fs/ext3/balloc.c | 84
-rw-r--r--  fs/ext3/inode.c | 9
-rw-r--r--  fs/ext4/balloc.c | 63
-rw-r--r--  fs/ext4/dir.c | 13
-rw-r--r--  fs/ext4/ext4.h | 34
-rw-r--r--  fs/ext4/ext4_extents.h | 4
-rw-r--r--  fs/ext4/ext4_jbd2.h | 128
-rw-r--r--  fs/ext4/extents.c | 330
-rw-r--r--  fs/ext4/fsync.c | 2
-rw-r--r--  fs/ext4/ialloc.c | 260
-rw-r--r--  fs/ext4/inode.c | 95
-rw-r--r--  fs/ext4/mballoc.c | 342
-rw-r--r--  fs/ext4/mballoc.h | 20
-rw-r--r--  fs/ext4/migrate.c | 2
-rw-r--r--  fs/ext4/mmp.c | 4
-rw-r--r--  fs/ext4/namei.c | 2
-rw-r--r--  fs/ext4/page-io.c | 18
-rw-r--r--  fs/ext4/resize.c | 37
-rw-r--r--  fs/ext4/super.c | 1075
-rw-r--r--  fs/ext4/xattr.c | 25
-rw-r--r--  fs/fat/namei_vfat.c | 83
-rw-r--r--  fs/file.c | 2
-rw-r--r--  fs/fs-writeback.c | 24
-rw-r--r--  fs/fs_struct.c | 2
-rw-r--r--  fs/hostfs/hostfs.h | 3
-rw-r--r--  fs/hostfs/hostfs_kern.c | 5
-rw-r--r--  fs/hostfs/hostfs_user.c | 4
-rw-r--r--  fs/ioctl.c | 2
-rw-r--r--  fs/jbd2/checkpoint.c | 140
-rw-r--r--  fs/jbd2/commit.c | 47
-rw-r--r--  fs/jbd2/journal.c | 361
-rw-r--r--  fs/jbd2/recovery.c | 5
-rw-r--r--  fs/jbd2/revoke.c | 12
-rw-r--r--  fs/jbd2/transaction.c | 48
-rw-r--r--  fs/libfs.c | 2
-rw-r--r--  fs/lockd/clnt4xdr.c | 2
-rw-r--r--  fs/lockd/clntlock.c | 3
-rw-r--r--  fs/lockd/clntxdr.c | 8
-rw-r--r--  fs/lockd/host.c | 42
-rw-r--r--  fs/lockd/mon.c | 21
-rw-r--r--  fs/lockd/netns.h | 12
-rw-r--r--  fs/lockd/svc.c | 117
-rw-r--r--  fs/lockd/svclock.c | 59
-rw-r--r--  fs/mpage.c | 2
-rw-r--r--  fs/namei.c | 2
-rw-r--r--  fs/nfs/Kconfig | 29
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 161
-rw-r--r--  fs/nfs/blocklayout/blocklayout.h | 11
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdev.c | 46
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdm.c | 33
-rw-r--r--  fs/nfs/blocklayout/extents.c | 2
-rw-r--r--  fs/nfs/cache_lib.c | 61
-rw-r--r--  fs/nfs/cache_lib.h | 10
-rw-r--r--  fs/nfs/callback.c | 19
-rw-r--r--  fs/nfs/callback.h | 3
-rw-r--r--  fs/nfs/callback_proc.c | 99
-rw-r--r--  fs/nfs/callback_xdr.c | 21
-rw-r--r--  fs/nfs/client.c | 246
-rw-r--r--  fs/nfs/delegation.c | 68
-rw-r--r--  fs/nfs/delegation.h | 4
-rw-r--r--  fs/nfs/dir.c | 27
-rw-r--r--  fs/nfs/direct.c | 6
-rw-r--r--  fs/nfs/dns_resolve.c | 130
-rw-r--r--  fs/nfs/dns_resolve.h | 14
-rw-r--r--  fs/nfs/file.c | 2
-rw-r--r--  fs/nfs/fscache.c | 2
-rw-r--r--  fs/nfs/idmap.c | 733
-rw-r--r--  fs/nfs/inode.c | 119
-rw-r--r--  fs/nfs/internal.h | 15
-rw-r--r--  fs/nfs/mount_clnt.c | 16
-rw-r--r--  fs/nfs/namespace.c | 5
-rw-r--r--  fs/nfs/netns.h | 27
-rw-r--r--  fs/nfs/nfs2xdr.c | 2
-rw-r--r--  fs/nfs/nfs3acl.c | 2
-rw-r--r--  fs/nfs/nfs3proc.c | 24
-rw-r--r--  fs/nfs/nfs3xdr.c | 4
-rw-r--r--  fs/nfs/nfs4_fs.h | 58
-rw-r--r--  fs/nfs/nfs4filelayout.c | 272
-rw-r--r--  fs/nfs/nfs4filelayout.h | 7
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c | 90
-rw-r--r--  fs/nfs/nfs4namespace.c | 10
-rw-r--r--  fs/nfs/nfs4proc.c | 559
-rw-r--r--  fs/nfs/nfs4state.c | 355
-rw-r--r--  fs/nfs/nfs4xdr.c | 697
-rw-r--r--  fs/nfs/nfsroot.c | 2
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 54
-rw-r--r--  fs/nfs/objlayout/objlayout.c | 142
-rw-r--r--  fs/nfs/objlayout/objlayout.h | 2
-rw-r--r--  fs/nfs/pagelist.c | 92
-rw-r--r--  fs/nfs/pnfs.c | 46
-rw-r--r--  fs/nfs/pnfs.h | 98
-rw-r--r--  fs/nfs/pnfs_dev.c | 4
-rw-r--r--  fs/nfs/proc.c | 24
-rw-r--r--  fs/nfs/read.c | 14
-rw-r--r--  fs/nfs/super.c | 167
-rw-r--r--  fs/nfs/sysctl.c | 2
-rw-r--r--  fs/nfs/unlink.c | 45
-rw-r--r--  fs/nfs/write.c | 213
-rw-r--r--  fs/nfsd/nfs4callback.c | 8
-rw-r--r--  fs/nfsd/nfs4state.c | 2
-rw-r--r--  fs/nfsd/nfsctl.c | 6
-rw-r--r--  fs/nfsd/nfssvc.c | 4
-rw-r--r--  fs/nfsd/stats.c | 5
-rw-r--r--  fs/notify/notification.c | 3
-rw-r--r--  fs/pipe.c | 1
-rw-r--r--  fs/posix_acl.c | 2
-rw-r--r--  fs/proc/array.c | 119
-rw-r--r--  fs/proc/internal.h | 3
-rw-r--r--  fs/proc/kcore.c | 6
-rw-r--r--  fs/proc/namespaces.c | 2
-rw-r--r--  fs/proc/proc_sysctl.c | 1274
-rw-r--r--  fs/proc/stat.c | 62
-rw-r--r--  fs/pstore/platform.c | 30
-rw-r--r--  fs/quota/dquot.c | 189
-rw-r--r--  fs/quota/quota.c | 3
-rw-r--r--  fs/read_write.c | 2
-rw-r--r--  fs/readdir.c | 2
-rw-r--r--  fs/reiserfs/reiserfs.h | 1
-rw-r--r--  fs/select.c | 42
-rw-r--r--  fs/seq_file.c | 86
-rw-r--r--  fs/splice.c | 2
-rw-r--r--  fs/stack.c | 2
-rw-r--r--  fs/stat.c | 2
-rw-r--r--  fs/statfs.c | 2
-rw-r--r--  fs/super.c | 2
-rw-r--r--  fs/sync.c | 2
-rw-r--r--  fs/ubifs/debug.c | 410
-rw-r--r--  fs/ubifs/debug.h | 3
-rw-r--r--  fs/ubifs/dir.c | 18
-rw-r--r--  fs/ubifs/recovery.c | 3
-rw-r--r--  fs/ubifs/sb.c | 19
-rw-r--r--  fs/ubifs/ubifs.h | 11
-rw-r--r--  fs/udf/balloc.c | 84
-rw-r--r--  fs/udf/ialloc.c | 1
-rw-r--r--  fs/udf/inode.c | 20
-rw-r--r--  fs/udf/super.c | 5
-rw-r--r--  fs/udf/udf_i.h | 1
-rw-r--r--  fs/xattr.c | 2
-rw-r--r--  fs/xattr_acl.c | 2
-rw-r--r--  fs/xfs/Makefile | 3
-rw-r--r--  fs/xfs/xfs_alloc.c | 36
-rw-r--r--  fs/xfs/xfs_alloc.h | 12
-rw-r--r--  fs/xfs/xfs_aops.c | 183
-rw-r--r--  fs/xfs/xfs_aops.h | 4
-rw-r--r--  fs/xfs/xfs_attr.c | 16
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 40
-rw-r--r--  fs/xfs/xfs_bmap.c | 22
-rw-r--r--  fs/xfs/xfs_buf.c | 17
-rw-r--r--  fs/xfs/xfs_da_btree.c | 32
-rw-r--r--  fs/xfs/xfs_dfrag.c | 24
-rw-r--r--  fs/xfs/xfs_dir2_block.c | 1
-rw-r--r--  fs/xfs/xfs_discard.c | 61
-rw-r--r--  fs/xfs/xfs_dquot.c | 418
-rw-r--r--  fs/xfs/xfs_dquot.h | 49
-rw-r--r--  fs/xfs/xfs_file.c | 84
-rw-r--r--  fs/xfs/xfs_iget.c | 47
-rw-r--r--  fs/xfs/xfs_inode.c | 94
-rw-r--r--  fs/xfs/xfs_inode.h | 27
-rw-r--r--  fs/xfs/xfs_inode_item.c | 297
-rw-r--r--  fs/xfs/xfs_inode_item.h | 16
-rw-r--r--  fs/xfs/xfs_ioctl.c | 28
-rw-r--r--  fs/xfs/xfs_ioctl32.c | 2
-rw-r--r--  fs/xfs/xfs_iomap.c | 19
-rw-r--r--  fs/xfs/xfs_iops.c | 71
-rw-r--r--  fs/xfs/xfs_itable.c | 24
-rw-r--r--  fs/xfs/xfs_log.c | 615
-rw-r--r--  fs/xfs/xfs_log.h | 16
-rw-r--r--  fs/xfs/xfs_log_priv.h | 28
-rw-r--r--  fs/xfs/xfs_log_recover.c | 39
-rw-r--r--  fs/xfs/xfs_mount.c | 8
-rw-r--r--  fs/xfs/xfs_mount.h | 5
-rw-r--r--  fs/xfs/xfs_qm.c | 628
-rw-r--r--  fs/xfs/xfs_qm.h | 49
-rw-r--r--  fs/xfs/xfs_qm_bhv.c | 42
-rw-r--r--  fs/xfs/xfs_qm_stats.c | 105
-rw-r--r--  fs/xfs/xfs_qm_stats.h | 53
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 130
-rw-r--r--  fs/xfs/xfs_quota.h | 2
-rw-r--r--  fs/xfs/xfs_quota_priv.h | 11
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 9
-rw-r--r--  fs/xfs/xfs_sb.h | 1
-rw-r--r--  fs/xfs/xfs_stats.c | 99
-rw-r--r--  fs/xfs/xfs_stats.h | 10
-rw-r--r--  fs/xfs/xfs_super.c | 197
-rw-r--r--  fs/xfs/xfs_super.h | 8
-rw-r--r--  fs/xfs/xfs_sync.c | 46
-rw-r--r--  fs/xfs/xfs_sync.h | 2
-rw-r--r--  fs/xfs/xfs_trace.h | 106
-rw-r--r--  fs/xfs/xfs_trans.c | 31
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 83
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 25
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 21
-rw-r--r--  fs/xfs/xfs_trans_inode.c | 8
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 3
-rw-r--r--  fs/xfs/xfs_vnode.h | 1
-rw-r--r--  fs/xfs/xfs_vnodeops.h | 3
229 files changed, 8641 insertions, 6891 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 10b7d3c9dba8..8c92a9ba8330 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -259,7 +259,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	if (v9fs_proto_dotl(v9ses)) {
 		res = p9_client_statfs(fid, &rs);
 		if (res == 0) {
-			buf->f_type = V9FS_MAGIC;
+			buf->f_type = rs.type;
 			buf->f_bsize = rs.bsize;
 			buf->f_blocks = rs.blocks;
 			buf->f_bfree = rs.bfree;
diff --git a/fs/aio.c b/fs/aio.c
index c7acaf3167aa..4f71627264fd 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -13,7 +13,7 @@
 #include <linux/errno.h>
 #include <linux/time.h>
 #include <linux/aio_abi.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/syscalls.h>
 #include <linux/backing-dev.h>
 #include <linux/uio.h>
diff --git a/fs/attr.c b/fs/attr.c
index 95053ad8abcc..73f69a6ce9ed 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -5,7 +5,7 @@
  * changes by Thomas Schoebel-Theuer
  */

-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/string.h>
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 22e9a78872ff..37268c5bb98b 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -9,7 +9,7 @@
  */

 #include <linux/fs.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/stat.h>
 #include <linux/time.h>
 #include <linux/namei.h>
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 18276531f7c6..7d7ff206cdcb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1094,6 +1094,29 @@ out:
  */

 /*
+ * The purpose of always_dump_vma() is to make sure that special kernel mappings
+ * that are useful for post-mortem analysis are included in every core dump.
+ * In that way we ensure that the core dump is fully interpretable later
+ * without matching up the same kernel and hardware config to see what PC values
+ * meant. These special mappings include - vDSO, vsyscall, and other
+ * architecture specific mappings
+ */
+static bool always_dump_vma(struct vm_area_struct *vma)
+{
+	/* Any vsyscall mappings? */
+	if (vma == get_gate_vma(vma->vm_mm))
+		return true;
+	/*
+	 * arch_vma_name() returns non-NULL for special architecture mappings,
+	 * such as vDSO sections.
+	 */
+	if (arch_vma_name(vma))
+		return true;
+
+	return false;
+}
+
+/*
  * Decide what to dump of a segment, part, all or none.
  */
 static unsigned long vma_dump_size(struct vm_area_struct *vma,
@@ -1101,10 +1124,13 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
 {
 #define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))

-	/* The vma can be set up to tell us the answer directly. */
-	if (vma->vm_flags & VM_ALWAYSDUMP)
+	/* always dump the vdso and vsyscall sections */
+	if (always_dump_vma(vma))
 		goto whole;

+	if (vma->vm_flags & VM_NODUMP)
+		return 0;
+
 	/* Hugetlb memory check */
 	if (vma->vm_flags & VM_HUGETLB) {
 		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
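The FILTER() macro in the context above derives a mask bit from a token-pasted constant, so each dump-policy check stays one readable line. A minimal userspace sketch of the same pattern (the DUMP_* names here are hypothetical stand-ins for the kernel's MMF_DUMP_* bits):

    #include <stdio.h>

    /* Hypothetical dump-policy bits, mirroring the MMF_DUMP_* layout. */
    enum { DUMP_ANON_PRIVATE, DUMP_ANON_SHARED, DUMP_HUGETLB_SHARED };

    /* FILTER(ANON_PRIVATE) expands to a test of bit DUMP_ANON_PRIVATE. */
    #define FILTER(type) (mm_flags & (1UL << DUMP_##type))

    int main(void)
    {
        unsigned long mm_flags = (1UL << DUMP_ANON_PRIVATE) |
                                 (1UL << DUMP_HUGETLB_SHARED);

        printf("anon private: %d\n", FILTER(ANON_PRIVATE) != 0); /* 1 */
        printf("anon shared:  %d\n", FILTER(ANON_SHARED) != 0);  /* 0 */
        return 0;
    }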
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 4e4017c08887..024d20ee3ca3 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -15,7 +15,7 @@
  *	JAN/99 -- coded full program relocation (gerg@snapgear.com)
  */

-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 1ffb60355cae..613aa0618235 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/sched.h>
+#include <linux/magic.h>
 #include <linux/binfmts.h>
 #include <linux/slab.h>
 #include <linux/ctype.h>
@@ -699,7 +700,7 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
 		[3] = {"register", &bm_register_operations, S_IWUSR},
 		/* last one */ {""}
 	};
-	int err = simple_fill_super(sb, 0x42494e4d, bm_files);
+	int err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
 	if (!err)
 		sb->s_op = &s_ops;
 	return err;
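The literal being replaced here is not arbitrary: 0x42494e4d is simply the ASCII bytes 'B' 'I' 'N' 'M', and <linux/magic.h> gives it the symbolic name BINFMTFS_MAGIC so the value lives in one place. A quick demonstration of the encoding:

    #include <stdio.h>

    int main(void)
    {
        unsigned int magic = 0x42494e4d; /* BINFMTFS_MAGIC */

        /* 0x42 'B', 0x49 'I', 0x4e 'N', 0x4d 'M' */
        printf("%c%c%c%c\n", (magic >> 24) & 0xff, (magic >> 16) & 0xff,
               (magic >> 8) & 0xff, magic & 0xff);
        return 0;
    }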
diff --git a/fs/bio.c b/fs/bio.c
index b980ecde026a..e453924036e9 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -22,7 +22,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
diff --git a/fs/block_dev.c b/fs/block_dev.c
index a9ff3000b83d..e08f6a20a5bb 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -16,6 +16,7 @@
 #include <linux/blkdev.h>
 #include <linux/module.h>
 #include <linux/blkpg.h>
+#include <linux/magic.h>
 #include <linux/buffer_head.h>
 #include <linux/swap.h>
 #include <linux/pagevec.h>
@@ -506,7 +507,7 @@ static const struct super_operations bdev_sops = {
 static struct dentry *bd_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
+	return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
 }

 static struct file_system_type bd_type = {
diff --git a/fs/buffer.c b/fs/buffer.c
index 1a30db77af32..70e2017edd70 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -29,7 +29,7 @@
 #include <linux/file.h>
 #include <linux/quotaops.h>
 #include <linux/highmem.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/writeback.h>
 #include <linux/hash.h>
 #include <linux/suspend.h>
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 2c489378b4cd..9fff9f3b17e4 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -677,18 +677,19 @@ static int fill_inode(struct inode *inode,
 	case S_IFLNK:
 		inode->i_op = &ceph_symlink_iops;
 		if (!ci->i_symlink) {
-			int symlen = iinfo->symlink_len;
+			u32 symlen = iinfo->symlink_len;
 			char *sym;

-			BUG_ON(symlen != inode->i_size);
 			spin_unlock(&ci->i_ceph_lock);

+			err = -EINVAL;
+			if (WARN_ON(symlen != inode->i_size))
+				goto out;
+
 			err = -ENOMEM;
-			sym = kmalloc(symlen+1, GFP_NOFS);
+			sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
 			if (!sym)
 				goto out;
-			memcpy(sym, iinfo->symlink, symlen);
-			sym[symlen] = 0;

 			spin_lock(&ci->i_ceph_lock);
 			if (!ci->i_symlink)
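kstrndup() bundles the three steps the removed code did by hand: allocate length + 1 bytes, copy, and NUL-terminate, returning NULL on allocation failure. A simplified userspace analogue of that contract (the real kernel helper also bounds the copy with strnlen() and takes a GFP flags argument):

    #include <stdlib.h>
    #include <string.h>

    static char *strndup_sketch(const char *s, size_t len)
    {
        char *p = malloc(len + 1);

        if (!p)
            return NULL;
        memcpy(p, s, len);
        p[len] = '\0';
        return p;
    }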
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 866e8d7ca37d..89971e137aab 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -402,7 +402,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,

 	spin_lock_init(&s->s_gen_ttl_lock);
 	s->s_cap_gen = 0;
-	s->s_cap_ttl = 0;
+	s->s_cap_ttl = jiffies - 1;

 	spin_lock_init(&s->s_cap_lock);
 	s->s_renew_requested = 0;
@@ -1083,8 +1083,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
 	int wake = 0;

 	spin_lock(&session->s_cap_lock);
-	was_stale = is_renew && (session->s_cap_ttl == 0 ||
-				 time_after_eq(jiffies, session->s_cap_ttl));
+	was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);

 	session->s_cap_ttl = session->s_renew_requested +
 		mdsc->mdsmap->m_session_timeout*HZ;
@@ -2332,7 +2331,7 @@ static void handle_session(struct ceph_mds_session *session,
 			session->s_mds);
 		spin_lock(&session->s_gen_ttl_lock);
 		session->s_cap_gen++;
-		session->s_cap_ttl = 0;
+		session->s_cap_ttl = jiffies - 1;
 		spin_unlock(&session->s_gen_ttl_lock);
 		send_renew_caps(mdsc, session);
 		break;
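Both hunks in this file replace the sentinel value 0 for s_cap_ttl with jiffies - 1, i.e. one tick in the past. Zero is a value the jiffies counter can legitimately pass through, so the 0 sentinel needed the extra `s_cap_ttl == 0` test removed above; a timestamp that is already expired needs no special case, because the wraparound-safe comparison handles it uniformly. The comparison is plain signed-difference arithmetic, as this self-contained sketch shows:

    #include <limits.h>
    #include <stdio.h>

    /* Same idea as the kernel's time_after_eq(): compare via signed delta. */
    #define time_after_eq(a, b) ((long)((a) - (b)) >= 0)

    int main(void)
    {
        unsigned long jiffies = ULONG_MAX - 1;  /* about to wrap */
        unsigned long ttl = jiffies - 1;        /* "already expired" */

        printf("expired: %d\n", time_after_eq(jiffies, ttl));       /* 1 */
        jiffies += 4;                           /* counter wraps to 2 */
        printf("still expired: %d\n", time_after_eq(jiffies, ttl)); /* 1 */
        return 0;
    }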
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index a559c80f127a..f04c0961f993 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -331,7 +331,7 @@ static int build_snap_context(struct ceph_snap_realm *realm)

 	/* alloc new snap context */
 	err = -ENOMEM;
-	if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
+	if (num > (ULONG_MAX - sizeof(*snapc)) / sizeof(u64))
 		goto fail;
 	snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
 	if (!snapc)
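The guard protects the kzalloc(sizeof(*snapc) + num*sizeof(u64)) below it. The exact no-overflow condition for hdr + n*elem is n <= (MAX - hdr) / elem; the old expression divided first and then subtracted sizeof(*snapc) from a count of u64 elements, mixing byte and element units (overly strict rather than unsafe, but still wrong). In isolation:

    #include <stdint.h>
    #include <stddef.h>

    /* Exact guard: hdr + n * elem fits in size_t iff this returns nonzero. */
    static int alloc_size_fits(size_t hdr, size_t n, size_t elem)
    {
        return n <= (SIZE_MAX - hdr) / elem;
    }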
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 256f85221926..1e67dd7305a4 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -130,10 +130,12 @@ enum {
 	Opt_nodirstat,
 	Opt_rbytes,
 	Opt_norbytes,
+	Opt_asyncreaddir,
 	Opt_noasyncreaddir,
 	Opt_dcache,
 	Opt_nodcache,
 	Opt_ino32,
+	Opt_noino32,
 };

 static match_table_t fsopt_tokens = {
@@ -153,10 +155,12 @@ static match_table_t fsopt_tokens = {
 	{Opt_nodirstat, "nodirstat"},
 	{Opt_rbytes, "rbytes"},
 	{Opt_norbytes, "norbytes"},
+	{Opt_asyncreaddir, "asyncreaddir"},
 	{Opt_noasyncreaddir, "noasyncreaddir"},
 	{Opt_dcache, "dcache"},
 	{Opt_nodcache, "nodcache"},
 	{Opt_ino32, "ino32"},
+	{Opt_noino32, "noino32"},
 	{-1, NULL}
 };

@@ -232,6 +236,9 @@ static int parse_fsopt_token(char *c, void *private)
 	case Opt_norbytes:
 		fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
 		break;
+	case Opt_asyncreaddir:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR;
+		break;
 	case Opt_noasyncreaddir:
 		fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
 		break;
@@ -244,6 +251,9 @@ static int parse_fsopt_token(char *c, void *private)
 	case Opt_ino32:
 		fsopt->flags |= CEPH_MOUNT_OPT_INO32;
 		break;
+	case Opt_noino32:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
+		break;
 	default:
 		BUG_ON(token);
 	}
@@ -334,10 +344,12 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
 		*path += 2;
 	dout("server path '%s'\n", *path);

-	err = ceph_parse_options(popt, options, dev_name, dev_name_end,
-				 parse_fsopt_token, (void *)fsopt);
-	if (err)
+	*popt = ceph_parse_options(options, dev_name, dev_name_end,
+				   parse_fsopt_token, (void *)fsopt);
+	if (IS_ERR(*popt)) {
+		err = PTR_ERR(*popt);
 		goto out;
+	}

 	/* success */
 	*pfsopt = fsopt;
@@ -926,6 +938,7 @@ static int __init init_ceph(void)
 	if (ret)
 		goto out;

+	ceph_xattr_init();
 	ret = register_filesystem(&ceph_fs_type);
 	if (ret)
 		goto out_icache;
@@ -935,6 +948,7 @@ static int __init init_ceph(void)
 	return 0;

 out_icache:
+	ceph_xattr_exit();
 	destroy_caches();
 out:
 	return ret;
@@ -944,6 +958,7 @@ static void __exit exit_ceph(void)
 {
 	dout("exit_ceph\n");
 	unregister_filesystem(&ceph_fs_type);
+	ceph_xattr_exit();
 	destroy_caches();
 }

diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1421f3d875a2..fc35036d258d 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -367,7 +367,7 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
 	u32 ino = vino & 0xffffffff;
 	ino ^= vino >> 32;
 	if (!ino)
-		ino = 1;
+		ino = 2;
 	return ino;
 }

@@ -733,6 +733,8 @@ extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
 extern int ceph_removexattr(struct dentry *, const char *);
 extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
 extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
+extern void __init ceph_xattr_init(void);
+extern void ceph_xattr_exit(void);

 /* caps.c */
 extern const char *ceph_cap_string(int c);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index a76f697303d9..35b86331d8a5 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -8,9 +8,12 @@
 #include <linux/xattr.h>
 #include <linux/slab.h>

+#define XATTR_CEPH_PREFIX "ceph."
+#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1)
+
 static bool ceph_is_valid_xattr(const char *name)
 {
-	return !strncmp(name, "ceph.", 5) ||
+	return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_SECURITY_PREFIX,
 			XATTR_SECURITY_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
@@ -21,79 +24,91 @@ static bool ceph_is_valid_xattr(const char *name)
  * These define virtual xattrs exposing the recursive directory
  * statistics and layout metadata.
  */
-struct ceph_vxattr_cb {
-	bool readonly;
+struct ceph_vxattr {
 	char *name;
+	size_t name_size;	/* strlen(name) + 1 (for '\0') */
 	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
 			      size_t size);
+	bool readonly;
 };

 /* directories */

-static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
-				    size_t size)
+static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
+					size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
 }

-static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
-				  size_t size)
+static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,
+				      size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_files);
 }

-static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
-				    size_t size)
+static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,
+					size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_subdirs);
 }

-static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
-				     size_t size)
+static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,
+					 size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
 }

-static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
-				   size_t size)
+static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,
+				       size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_rfiles);
 }

-static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
-				     size_t size)
+static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
+					 size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_rsubdirs);
 }

-static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
-				   size_t size)
+static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
+				       size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_rbytes);
 }

-static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
-				   size_t size)
+static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
+				       size_t size)
 {
-	return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
+	return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec,
 			(long)ci->i_rctime.tv_nsec);
 }

-static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
-	{ true, "ceph.dir.entries", ceph_vxattrcb_entries},
-	{ true, "ceph.dir.files", ceph_vxattrcb_files},
-	{ true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
-	{ true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
-	{ true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
-	{ true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
-	{ true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
-	{ true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
-	{ true, NULL, NULL }
+#define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name
+
+#define XATTR_NAME_CEPH(_type, _name) \
+	{ \
+		.name = CEPH_XATTR_NAME(_type, _name), \
+		.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+		.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+		.readonly = true, \
+	}
+
+static struct ceph_vxattr ceph_dir_vxattrs[] = {
+	XATTR_NAME_CEPH(dir, entries),
+	XATTR_NAME_CEPH(dir, files),
+	XATTR_NAME_CEPH(dir, subdirs),
+	XATTR_NAME_CEPH(dir, rentries),
+	XATTR_NAME_CEPH(dir, rfiles),
+	XATTR_NAME_CEPH(dir, rsubdirs),
+	XATTR_NAME_CEPH(dir, rbytes),
+	XATTR_NAME_CEPH(dir, rctime),
+	{ 0 }	/* Required table terminator */
 };
+static size_t ceph_dir_vxattrs_name_size;	/* total size of all names */

 /* files */

-static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
-				   size_t size)
+static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
+					size_t size)
 {
 	int ret;
@@ -103,21 +118,32 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 			(unsigned long long)ceph_file_layout_su(ci->i_layout),
 			(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
 			(unsigned long long)ceph_file_layout_object_size(ci->i_layout));
-	if (ceph_file_layout_pg_preferred(ci->i_layout))
-		ret += snprintf(val + ret, size, "preferred_osd=%lld\n",
+
+	if (ceph_file_layout_pg_preferred(ci->i_layout) >= 0) {
+		val += ret;
+		size -= ret;
+		ret += snprintf(val, size, "preferred_osd=%lld\n",
 			(unsigned long long)ceph_file_layout_pg_preferred(
 			ci->i_layout));
+	}
+
 	return ret;
 }

-static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
-	{ true, "ceph.file.layout", ceph_vxattrcb_layout},
+static struct ceph_vxattr ceph_file_vxattrs[] = {
+	XATTR_NAME_CEPH(file, layout),
 	/* The following extended attribute name is deprecated */
-	{ true, "ceph.layout", ceph_vxattrcb_layout},
-	{ true, NULL, NULL }
+	{
+		.name = XATTR_CEPH_PREFIX "layout",
+		.name_size = sizeof (XATTR_CEPH_PREFIX "layout"),
+		.getxattr_cb = ceph_vxattrcb_file_layout,
+		.readonly = true,
+	},
+	{ 0 }	/* Required table terminator */
 };
+static size_t ceph_file_vxattrs_name_size;	/* total size of all names */

-static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
+static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
 {
 	if (S_ISDIR(inode->i_mode))
 		return ceph_dir_vxattrs;
@@ -126,14 +152,59 @@ static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
 	return NULL;
 }

-static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
+static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs)
+{
+	if (vxattrs == ceph_dir_vxattrs)
+		return ceph_dir_vxattrs_name_size;
+	if (vxattrs == ceph_file_vxattrs)
+		return ceph_file_vxattrs_name_size;
+	BUG();
+
+	return 0;
+}
+
+/*
+ * Compute the aggregate size (including terminating '\0') of all
+ * virtual extended attribute names in the given vxattr table.
+ */
+static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
+{
+	struct ceph_vxattr *vxattr;
+	size_t size = 0;
+
+	for (vxattr = vxattrs; vxattr->name; vxattr++)
+		size += vxattr->name_size;
+
+	return size;
+}
+
+/* Routines called at initialization and exit time */
+
+void __init ceph_xattr_init(void)
+{
+	ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs);
+	ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs);
+}
+
+void ceph_xattr_exit(void)
+{
+	ceph_dir_vxattrs_name_size = 0;
+	ceph_file_vxattrs_name_size = 0;
+}
+
+static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
 					     const char *name)
 {
-	do {
-		if (strcmp(vxattr->name, name) == 0)
-			return vxattr;
-		vxattr++;
-	} while (vxattr->name);
+	struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode);
+
+	if (vxattr) {
+		while (vxattr->name) {
+			if (!strcmp(vxattr->name, name))
+				return vxattr;
+			vxattr++;
+		}
+	}
+
 	return NULL;
 }

@@ -502,17 +573,15 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 {
 	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
 	int err;
 	struct ceph_inode_xattr *xattr;
-	struct ceph_vxattr_cb *vxattr = NULL;
+	struct ceph_vxattr *vxattr = NULL;

 	if (!ceph_is_valid_xattr(name))
 		return -ENODATA;

 	/* let's see if a virtual xattr was requested */
-	if (vxattrs)
-		vxattr = ceph_match_vxattr(vxattrs, name);
+	vxattr = ceph_match_vxattr(inode, name);

 	spin_lock(&ci->i_ceph_lock);
 	dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
@@ -568,7 +637,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
 {
 	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+	struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
 	u32 vir_namelen = 0;
 	u32 namelen;
 	int err;
@@ -596,11 +665,12 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
 		goto out;

 list_xattr:
-	vir_namelen = 0;
-	/* include virtual dir xattrs */
-	if (vxattrs)
-		for (i = 0; vxattrs[i].name; i++)
-			vir_namelen += strlen(vxattrs[i].name) + 1;
+	/*
+	 * Start with virtual dir xattr names (if any) (including
+	 * terminating '\0' characters for each).
+	 */
+	vir_namelen = ceph_vxattrs_name_size(vxattrs);
+
 	/* adding 1 byte per each variable due to the null termination */
 	namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
 	err = -ERANGE;
@@ -698,17 +768,17 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 		  const void *value, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
+	struct ceph_vxattr *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+	int issued;
 	int err;
+	int dirty;
 	int name_len = strlen(name);
 	int val_len = size;
 	char *newname = NULL;
 	char *newval = NULL;
 	struct ceph_inode_xattr *xattr = NULL;
-	int issued;
 	int required_blob_size;
-	int dirty;

 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
@@ -716,12 +786,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 	if (!ceph_is_valid_xattr(name))
 		return -EOPNOTSUPP;

-	if (vxattrs) {
-		struct ceph_vxattr_cb *vxattr =
-			ceph_match_vxattr(vxattrs, name);
-		if (vxattr && vxattr->readonly)
-			return -EOPNOTSUPP;
-	}
+	vxattr = ceph_match_vxattr(inode, name);
+	if (vxattr && vxattr->readonly)
+		return -EOPNOTSUPP;

 	/* preallocate memory for xattr name, value, index node */
 	err = -ENOMEM;
@@ -730,11 +797,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 		goto out;

 	if (val_len) {
-		newval = kmalloc(val_len + 1, GFP_NOFS);
+		newval = kmemdup(value, val_len, GFP_NOFS);
 		if (!newval)
 			goto out;
-		memcpy(newval, value, val_len);
-		newval[val_len] = '\0';
 	}

 	xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
@@ -744,6 +809,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 	spin_lock(&ci->i_ceph_lock);
 retry:
 	issued = __ceph_caps_issued(ci, NULL);
+	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
 	if (!(issued & CEPH_CAP_XATTR_EXCL))
 		goto do_sync;
 	__build_xattrs(inode);
@@ -752,7 +818,7 @@ retry:

 	if (!ci->i_xattrs.prealloc_blob ||
 	    required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
-		struct ceph_buffer *blob = NULL;
+		struct ceph_buffer *blob;

 		spin_unlock(&ci->i_ceph_lock);
 		dout(" preaallocating new blob size=%d\n", required_blob_size);
@@ -766,12 +832,13 @@ retry:
 		goto retry;
 	}

-	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
 	err = __set_xattr(ci, newname, name_len, newval,
 			  val_len, 1, 1, 1, &xattr);
+
 	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
 	ci->i_xattrs.dirty = true;
 	inode->i_ctime = CURRENT_TIME;
+
 	spin_unlock(&ci->i_ceph_lock);
 	if (dirty)
 		__mark_inode_dirty(inode, dirty);
@@ -816,8 +883,8 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 int ceph_removexattr(struct dentry *dentry, const char *name)
 {
 	struct inode *inode = dentry->d_inode;
+	struct ceph_vxattr *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
 	int issued;
 	int err;
 	int required_blob_size;
@@ -829,22 +896,19 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
 	if (!ceph_is_valid_xattr(name))
 		return -EOPNOTSUPP;

-	if (vxattrs) {
-		struct ceph_vxattr_cb *vxattr =
-			ceph_match_vxattr(vxattrs, name);
-		if (vxattr && vxattr->readonly)
-			return -EOPNOTSUPP;
-	}
+	vxattr = ceph_match_vxattr(inode, name);
+	if (vxattr && vxattr->readonly)
+		return -EOPNOTSUPP;

 	err = -ENOMEM;
 	spin_lock(&ci->i_ceph_lock);
-	__build_xattrs(inode);
 retry:
 	issued = __ceph_caps_issued(ci, NULL);
 	dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));

 	if (!(issued & CEPH_CAP_XATTR_EXCL))
 		goto do_sync;
+	__build_xattrs(inode);

 	required_blob_size = __get_required_blob_size(ci, 0, 0);

@@ -865,10 +929,10 @@ retry:
 	}

 	err = __remove_xattr_by_name(ceph_inode(inode), name);
+
 	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
 	ci->i_xattrs.dirty = true;
 	inode->i_ctime = CURRENT_TIME;
-
 	spin_unlock(&ci->i_ceph_lock);
 	if (dirty)
 		__mark_inode_dirty(inode, dirty);
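The table rewrite in this file leans on two preprocessor facts: adjacent string literals concatenate (XATTR_CEPH_PREFIX #_type "." #_name becomes one string), and sizeof applied to a string literal counts the terminating NUL, which is exactly the strlen(name) + 1 that the new name_size field documents. A compilable miniature of the same pattern (the PREFIX/ENTRY names are illustrative, not the kernel's):

    #include <stddef.h>
    #include <stdio.h>

    #define PREFIX "ceph."
    #define NAME(_type, _name) PREFIX #_type "." #_name

    struct entry {
        const char *name;
        size_t name_size; /* strlen(name) + 1: sizeof counts the NUL */
    };

    #define ENTRY(_type, _name) \
        { .name = NAME(_type, _name), .name_size = sizeof(NAME(_type, _name)) }

    static struct entry table[] = {
        ENTRY(dir, entries),
        ENTRY(dir, rbytes),
        { 0 } /* terminator, as in the patch */
    };

    int main(void)
    {
        struct entry *e;

        for (e = table; e->name; e++)
            printf("%s -> %zu\n", e->name, e->name_size);
        return 0; /* prints ceph.dir.entries -> 17, ceph.dir.rbytes -> 16 */
    }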
diff --git a/fs/cifs/README b/fs/cifs/README
index 895da1dc1550..b7d782bab797 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -753,10 +753,6 @@ module loading or during the runtime by using the interface

 i.e. echo "value" > /sys/module/cifs/parameters/<param>

-1. echo_retries - The number of echo attempts before giving up and
-	   reconnecting to the server. The default is 5. The value 0
-	   means never reconnect.
-
-2. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default.
+1. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default.
 	   [Y/y/1]. To disable use any of [N/n/0].

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 24b3dfc05282..573b899b5a5d 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -171,8 +171,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 		seq_printf(m, "TCP status: %d\n\tLocal Users To "
 			   "Server: %d SecMode: 0x%x Req On Wire: %d",
 			   server->tcpStatus, server->srv_count,
-			   server->sec_mode,
-			   atomic_read(&server->inFlight));
+			   server->sec_mode, in_flight(server));

 #ifdef CONFIG_CIFS_STATS2
 	seq_printf(m, " In Send: %d In MaxReq Wait: %d",
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 418fc42fb8b2..eee522c56ef0 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -76,12 +76,7 @@ MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
 unsigned int cifs_max_pending = CIFS_MAX_REQ;
 module_param(cifs_max_pending, int, 0444);
 MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
-				   "Default: 50 Range: 2 to 256");
-unsigned short echo_retries = 5;
-module_param(echo_retries, ushort, 0644);
-MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and "
-			       "reconnecting server. Default: 5. 0 means "
-			       "never reconnect.");
+				   "Default: 32767 Range: 2 to 32767.");
 module_param(enable_oplocks, bool, 0644);
 MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:"
 				 "y/Y/1");
@@ -1111,9 +1106,9 @@ init_cifs(void)
 	if (cifs_max_pending < 2) {
 		cifs_max_pending = 2;
 		cFYI(1, "cifs_max_pending set to min of 2");
-	} else if (cifs_max_pending > 256) {
-		cifs_max_pending = 256;
-		cFYI(1, "cifs_max_pending set to max of 256");
+	} else if (cifs_max_pending > CIFS_MAX_REQ) {
+		cifs_max_pending = CIFS_MAX_REQ;
+		cFYI(1, "cifs_max_pending set to max of %u", CIFS_MAX_REQ);
 	}

 	rc = cifs_fscache_register();
@@ -1175,11 +1170,8 @@ static void __exit
 exit_cifs(void)
 {
 	cFYI(DBG2, "exit_cifs");
-	cifs_proc_clean();
-	cifs_fscache_unregister();
-#ifdef CONFIG_CIFS_DFS_UPCALL
+	unregister_filesystem(&cifs_fs_type);
 	cifs_dfs_release_automount_timer();
-#endif
 #ifdef CONFIG_CIFS_ACL
 	cifs_destroy_idmaptrees();
 	exit_cifs_idmap();
@@ -1187,10 +1179,11 @@ exit_cifs(void)
 #ifdef CONFIG_CIFS_UPCALL
 	unregister_key_type(&cifs_spnego_key_type);
 #endif
-	unregister_filesystem(&cifs_fs_type);
-	cifs_destroy_inodecache();
-	cifs_destroy_mids();
 	cifs_destroy_request_bufs();
+	cifs_destroy_mids();
+	cifs_destroy_inodecache();
+	cifs_fscache_unregister();
+	cifs_proc_clean();
 }

 MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 76e7d8b6da17..339ebe3ebc0d 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -55,14 +55,9 @@

 /*
  * MAX_REQ is the maximum number of requests that WE will send
- * on one socket concurrently. It also matches the most common
- * value of max multiplex returned by servers. We may
- * eventually want to use the negotiated value (in case
- * future servers can handle more) when we are more confident that
- * we will not have problems oveloading the socket with pending
- * write data.
+ * on one socket concurrently.
  */
-#define CIFS_MAX_REQ 50
+#define CIFS_MAX_REQ 32767

 #define RFC1001_NAME_LEN 15
 #define RFC1001_NAME_LEN_WITH_NULL (RFC1001_NAME_LEN + 1)
@@ -255,7 +250,9 @@ struct TCP_Server_Info {
 	bool noblocksnd;		/* use blocking sendmsg */
 	bool noautotune;		/* do not autotune send buf sizes */
 	bool tcp_nodelay;
-	atomic_t inFlight;  /* number of requests on the wire to server */
+	int credits;  /* send no more requests at once */
+	unsigned int in_flight;  /* number of requests on the wire to server */
+	spinlock_t req_lock;  /* protect the two values above */
 	struct mutex srv_mutex;
 	struct task_struct *tsk;
 	char server_GUID[16];
@@ -263,6 +260,7 @@ struct TCP_Server_Info {
 	bool session_estab; /* mark when very first sess is established */
 	u16 dialect; /* dialect index that server chose */
 	enum securityEnum secType;
+	bool oplocks:1; /* enable oplocks */
 	unsigned int maxReq;	/* Clients should submit no more */
 	/* than maxReq distinct unanswered SMBs to the server when using  */
 	/* multiplexed reads or writes */
@@ -307,6 +305,36 @@ struct TCP_Server_Info {
 #endif
 };

+static inline unsigned int
+in_flight(struct TCP_Server_Info *server)
+{
+	unsigned int num;
+	spin_lock(&server->req_lock);
+	num = server->in_flight;
+	spin_unlock(&server->req_lock);
+	return num;
+}
+
+static inline int*
+get_credits_field(struct TCP_Server_Info *server)
+{
+	/*
+	 * This will change to switch statement when we reserve slots for echos
+	 * and oplock breaks.
+	 */
+	return &server->credits;
+}
+
+static inline bool
+has_credits(struct TCP_Server_Info *server, int *credits)
+{
+	int num;
+	spin_lock(&server->req_lock);
+	num = *credits;
+	spin_unlock(&server->req_lock);
+	return num > 0;
+}
+
 /*
  * Macros to allow the TCP_Server_Info->net field and related code to drop out
  * when CONFIG_NET_NS isn't set.
@@ -1010,9 +1038,6 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
 GLOBAL_EXTERN unsigned int cifs_min_small;  /* min size of small buf pool */
 GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/

-/* reconnect after this many failed echo attempts */
-GLOBAL_EXTERN unsigned short echo_retries;
-
 #ifdef CONFIG_CIFS_ACL
 GLOBAL_EXTERN struct rb_root uidtree;
 GLOBAL_EXTERN struct rb_root gidtree;
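The new helpers replace a bare atomic_t with a spinlock-protected pair so credits and in_flight always change together under req_lock. The shape of the scheme, sketched in portable C with pthreads (names hypothetical; the kernel of course uses its own spinlocks and wait queues):

    #include <pthread.h>
    #include <stdbool.h>

    struct server {
        pthread_mutex_t req_lock; /* protects the two fields below */
        int credits;              /* how many more requests we may send */
        unsigned int in_flight;   /* requests currently on the wire */
    };

    /* Consume one credit before sending; fails when none are available. */
    static bool take_credit(struct server *s)
    {
        bool ok = false;

        pthread_mutex_lock(&s->req_lock);
        if (s->credits > 0) {
            s->credits--;
            s->in_flight++;
            ok = true;
        }
        pthread_mutex_unlock(&s->req_lock);
        return ok;
    }

    /* Completion path returns the credit, cf. cifs_add_credits(server, 1). */
    static void return_credit(struct server *s)
    {
        pthread_mutex_lock(&s->req_lock);
        s->credits++;
        s->in_flight--;
        pthread_mutex_unlock(&s->req_lock);
    }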
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 6f4e243e0f62..503e73d8bdb7 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -88,6 +88,9 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
 			struct smb_hdr *in_buf ,
 			struct smb_hdr *out_buf,
 			int *bytes_returned);
+extern void cifs_add_credits(struct TCP_Server_Info *server,
+			     const unsigned int add);
+extern void cifs_set_credits(struct TCP_Server_Info *server, const int val);
 extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
 extern bool is_valid_oplock_break(struct smb_hdr *smb,
 				  struct TCP_Server_Info *);
@@ -168,7 +171,13 @@ extern struct smb_vol *cifs_get_volume_info(char *mount_data,
 				 const char *devname);
 extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *);
 extern void cifs_umount(struct cifs_sb_info *);
+
+#if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL)
 extern void cifs_dfs_release_automount_timer(void);
+#else /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */
+#define cifs_dfs_release_automount_timer() do { } while (0)
+#endif /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */
+
 void cifs_proc_init(void);
 void cifs_proc_clean(void);

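The hunk above is the standard stub-macro idiom: when the option is enabled the real declaration is visible, and when it is not, the call compiles away to an empty statement, so callers (such as exit_cifs() earlier in this series) can drop their own #ifdef. IS_ENABLED() additionally evaluates true for both built-in (=y) and modular (=m) configurations. Generically, under an assumed config symbol:

    /* HAVE_FEATURE stands in for a real config symbol. */
    #ifdef HAVE_FEATURE
    void feature_timer_release(void) { /* real work */ }
    #else
    /* do { } while (0) keeps the stub a single statement, safe in if/else. */
    #define feature_timer_release() do { } while (0)
    #endif

    static void teardown(void)
    {
        feature_timer_release(); /* compiles either way */
    }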
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 8b7794c31591..70aac35c398f 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -458,7 +458,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
 		goto neg_err_exit;
 	}
 	server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode);
-	server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
+	server->maxReq = min_t(unsigned int,
+			       le16_to_cpu(rsp->MaxMpxCount),
+			       cifs_max_pending);
+	cifs_set_credits(server, server->maxReq);
 	server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
 	server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
 	/* even though we do not use raw we might as well set this
@@ -564,7 +567,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)

 	/* one byte, so no need to convert this or EncryptionKeyLen from
 	   little endian */
-	server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount);
+	server->maxReq = min_t(unsigned int, le16_to_cpu(pSMBr->MaxMpxCount),
+			       cifs_max_pending);
+	cifs_set_credits(server, server->maxReq);
 	/* probably no need to store and check maxvcs */
 	server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);
 	server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
@@ -716,8 +721,7 @@ cifs_echo_callback(struct mid_q_entry *mid)
 	struct TCP_Server_Info *server = mid->callback_data;

 	DeleteMidQEntry(mid);
-	atomic_dec(&server->inFlight);
-	wake_up(&server->request_q);
+	cifs_add_credits(server, 1);
 }

 int
@@ -1669,8 +1673,7 @@ cifs_readv_callback(struct mid_q_entry *mid)

 	queue_work(system_nrt_wq, &rdata->work);
 	DeleteMidQEntry(mid);
-	atomic_dec(&server->inFlight);
-	wake_up(&server->request_q);
+	cifs_add_credits(server, 1);
 }

 /* cifs_async_readv - send an async write, and set up mid to handle result */
@@ -2110,8 +2113,7 @@ cifs_writev_callback(struct mid_q_entry *mid)

 	queue_work(system_nrt_wq, &wdata->work);
 	DeleteMidQEntry(mid);
-	atomic_dec(&tcon->ses->server->inFlight);
-	wake_up(&tcon->ses->server->request_q);
+	cifs_add_credits(tcon->ses->server, 1);
 }

 /* cifs_async_writev - send an async write, and set up mid to handle result */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 602f77c304c9..5560e1d5e54b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -373,12 +373,22 @@ allocate_buffers(struct TCP_Server_Info *server)
 static bool
 server_unresponsive(struct TCP_Server_Info *server)
 {
-	if (echo_retries > 0 && server->tcpStatus == CifsGood &&
-	    time_after(jiffies, server->lstrp +
-				(echo_retries * SMB_ECHO_INTERVAL))) {
+	/*
+	 * We need to wait 2 echo intervals to make sure we handle such
+	 * situations right:
+	 * 1s  client sends a normal SMB request
+	 * 2s  client gets a response
+	 * 30s echo workqueue job pops, and decides we got a response recently
+	 *     and don't need to send another
+	 * ...
+	 * 65s kernel_recvmsg times out, and we see that we haven't gotten
+	 *     a response in >60s.
+	 */
+	if (server->tcpStatus == CifsGood &&
+	    time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) {
 		cERROR(1, "Server %s has not responded in %d seconds. "
 			  "Reconnecting...", server->hostname,
-			  (echo_retries * SMB_ECHO_INTERVAL / HZ));
+			  (2 * SMB_ECHO_INTERVAL) / HZ);
 		cifs_reconnect(server);
 		wake_up(&server->response_q);
 		return true;
@@ -642,19 +652,11 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
 	spin_unlock(&GlobalMid_Lock);
 	wake_up_all(&server->response_q);

-	/*
-	 * Check if we have blocked requests that need to free. Note that
-	 * cifs_max_pending is normally 50, but can be set at module install
-	 * time to as little as two.
-	 */
-	spin_lock(&GlobalMid_Lock);
-	if (atomic_read(&server->inFlight) >= cifs_max_pending)
-		atomic_set(&server->inFlight, cifs_max_pending - 1);
-	/*
-	 * We do not want to set the max_pending too low or we could end up
-	 * with the counter going negative.
-	 */
-	spin_unlock(&GlobalMid_Lock);
+	/* check if we have blocked requests that need to free */
+	spin_lock(&server->req_lock);
+	if (server->credits <= 0)
+		server->credits = 1;
+	spin_unlock(&server->req_lock);
 	/*
 	 * Although there should not be any requests blocked on this queue it
 	 * can not hurt to be paranoid and try to wake up requests that may
@@ -1909,7 +1911,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1909 tcp_ses->noblocksnd = volume_info->noblocksnd; 1911 tcp_ses->noblocksnd = volume_info->noblocksnd;
1910 tcp_ses->noautotune = volume_info->noautotune; 1912 tcp_ses->noautotune = volume_info->noautotune;
1911 tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay; 1913 tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay;
1912 atomic_set(&tcp_ses->inFlight, 0); 1914 tcp_ses->in_flight = 0;
1915 tcp_ses->credits = 1;
1913 init_waitqueue_head(&tcp_ses->response_q); 1916 init_waitqueue_head(&tcp_ses->response_q);
1914 init_waitqueue_head(&tcp_ses->request_q); 1917 init_waitqueue_head(&tcp_ses->request_q);
1915 INIT_LIST_HEAD(&tcp_ses->pending_mid_q); 1918 INIT_LIST_HEAD(&tcp_ses->pending_mid_q);
@@ -3371,7 +3374,7 @@ cifs_ra_pages(struct cifs_sb_info *cifs_sb)
3371int 3374int
3372cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) 3375cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
3373{ 3376{
3374 int rc = 0; 3377 int rc;
3375 int xid; 3378 int xid;
3376 struct cifs_ses *pSesInfo; 3379 struct cifs_ses *pSesInfo;
3377 struct cifs_tcon *tcon; 3380 struct cifs_tcon *tcon;
@@ -3398,6 +3401,7 @@ try_mount_again:
3398 FreeXid(xid); 3401 FreeXid(xid);
3399 } 3402 }
3400#endif 3403#endif
3404 rc = 0;
3401 tcon = NULL; 3405 tcon = NULL;
3402 pSesInfo = NULL; 3406 pSesInfo = NULL;
3403 srvTcp = NULL; 3407 srvTcp = NULL;
@@ -3759,9 +3763,11 @@ int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses)
3759 if (server->maxBuf != 0) 3763 if (server->maxBuf != 0)
3760 return 0; 3764 return 0;
3761 3765
3766 cifs_set_credits(server, 1);
3762 rc = CIFSSMBNegotiate(xid, ses); 3767 rc = CIFSSMBNegotiate(xid, ses);
3763 if (rc == -EAGAIN) { 3768 if (rc == -EAGAIN) {
3764 /* retry only once on 1st time connection */ 3769 /* retry only once on 1st time connection */
3770 cifs_set_credits(server, 1);
3765 rc = CIFSSMBNegotiate(xid, ses); 3771 rc = CIFSSMBNegotiate(xid, ses);
3766 if (rc == -EAGAIN) 3772 if (rc == -EAGAIN)
3767 rc = -EHOSTDOWN; 3773 rc = -EHOSTDOWN;
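
A quick way to sanity-check the new timeout rule in server_unresponsive() is to model it in plain C. The sketch below is not kernel code: HZ, SMB_ECHO_INTERVAL, jiffies and lstrp are stand-ins with assumed values, and time_after() is re-derived from its kernel definition.

#include <stdbool.h>
#include <stdio.h>

#define HZ                1000UL
#define SMB_ECHO_INTERVAL (60 * HZ)	/* assumed 60s, as in cifsglob.h */

/* time_after(a, b): true if a is later than b, safe across wraparound */
static bool time_after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;
}

static bool server_unresponsive(unsigned long jiffies, unsigned long lstrp)
{
	/* two echo intervals with no response at all: declare it dead */
	return time_after(jiffies, lstrp + 2 * SMB_ECHO_INTERVAL);
}

int main(void)
{
	printf("%d\n", server_unresponsive(65 * HZ, 0));  /* 0: keep waiting */
	printf("%d\n", server_unresponsive(121 * HZ, 0)); /* 1: reconnect */
	return 0;
}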
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index bc7e24420ac0..d172c8ed9017 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -171,7 +171,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
 	}
 	tcon = tlink_tcon(tlink);
 
-	if (enable_oplocks)
+	if (tcon->ses->server->oplocks)
 		oplock = REQ_OPLOCK;
 
 	if (nd)
@@ -492,7 +492,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 {
 	int xid;
 	int rc = 0; /* to get around spurious gcc warning, set to zero here */
-	__u32 oplock = enable_oplocks ? REQ_OPLOCK : 0;
+	__u32 oplock;
 	__u16 fileHandle = 0;
 	bool posix_open = false;
 	struct cifs_sb_info *cifs_sb;
@@ -518,6 +518,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	}
 	pTcon = tlink_tcon(tlink);
 
+	oplock = pTcon->ses->server->oplocks ? REQ_OPLOCK : 0;
+
 	/*
	 * Don't allow the separator character in a path component.
	 * The VFS will not allow "/", but "\" is allowed by posix.
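
The oplock gating above now reads per-server state rather than the global enable_oplocks module parameter, because cifs_set_credits() (added in fs/cifs/misc.c below) turns oplocks off whenever a server is down to a single credit. A minimal sketch of that decision, with an illustrative REQ_OPLOCK value and a trimmed-down struct:

#include <stdbool.h>

#define REQ_OPLOCK 0x2	/* illustrative value only */

struct tcp_server_info {
	bool oplocks;	/* maintained by cifs_set_credits(): credits > 1 */
};

unsigned int oplock_request(const struct tcp_server_info *server)
{
	/* was: enable_oplocks ? REQ_OPLOCK : 0 (a module-global switch) */
	return server->oplocks ? REQ_OPLOCK : 0;
}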
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5e64748a2917..159fcc56dc2d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -380,7 +380,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
 	     inode, file->f_flags, full_path);
 
-	if (enable_oplocks)
+	if (tcon->ses->server->oplocks)
 		oplock = REQ_OPLOCK;
 	else
 		oplock = 0;
@@ -505,7 +505,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
 	cFYI(1, "inode = 0x%p file flags 0x%x for %s",
 	     inode, pCifsFile->f_flags, full_path);
 
-	if (enable_oplocks)
+	if (tcon->ses->server->oplocks)
 		oplock = REQ_OPLOCK;
 	else
 		oplock = 0;
@@ -960,9 +960,9 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 	INIT_LIST_HEAD(&locks_to_send);
 
 	/*
-	 * Allocating count locks is enough because no locks can be added to
-	 * the list while we are holding cinode->lock_mutex that protects
-	 * locking operations of this inode.
+	 * Allocating count locks is enough because no FL_POSIX locks can be
+	 * added to the list while we are holding cinode->lock_mutex that
+	 * protects locking operations of this inode.
 	 */
 	for (; i < count; i++) {
 		lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL);
@@ -973,18 +973,20 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 		list_add_tail(&lck->llist, &locks_to_send);
 	}
 
-	i = 0;
 	el = locks_to_send.next;
 	lock_flocks();
 	cifs_for_each_lock(cfile->dentry->d_inode, before) {
+		flock = *before;
+		if ((flock->fl_flags & FL_POSIX) == 0)
+			continue;
 		if (el == &locks_to_send) {
-			/* something is really wrong */
+			/*
+			 * The list ended. We don't have enough allocated
+			 * structures - something is really wrong.
+			 */
 			cERROR(1, "Can't push all brlocks!");
 			break;
 		}
-		flock = *before;
-		if ((flock->fl_flags & FL_POSIX) == 0)
-			continue;
 		length = 1 + flock->fl_end - flock->fl_start;
 		if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK)
 			type = CIFS_RDLCK;
@@ -996,7 +998,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 		lck->length = length;
 		lck->type = type;
 		lck->offset = flock->fl_start;
-		i++;
 		el = el->next;
 	}
 	unlock_flocks();
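
The reordering in cifs_push_posix_locks() matters because the lock_to_push structures are preallocated only for POSIX locks: the old code checked for slot exhaustion before filtering, so a trailing flock-style lock could trip the "Can't push all brlocks!" error spuriously. A standalone model of the corrected loop, using an array in place of the kernel lists:

#include <stdio.h>

#define FL_POSIX 1
#define FL_FLOCK 2

struct lk { int flags; };

static int push_locks(const struct lk *locks, int n, int slots)
{
	int used = 0;

	for (int i = 0; i < n; i++) {
		if (!(locks[i].flags & FL_POSIX))
			continue;	/* filter first (the fix) */
		if (used == slots) {
			fprintf(stderr, "Can't push all brlocks!\n");
			return -1;
		}
		used++;			/* consume a preallocated slot */
	}
	return 0;
}

int main(void)
{
	/* two POSIX locks fill both slots; the trailing flock-style lock
	 * must be skipped, not treated as slot exhaustion (the old order
	 * checked slots first and errored out here) */
	struct lk locks[] = { {FL_POSIX}, {FL_POSIX}, {FL_FLOCK} };

	return push_locks(locks, 3, 2) ? 1 : 0;	/* succeeds after the fix */
}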
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 703ef5c6fdb1..c273c12de98e 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -690,3 +690,22 @@ backup_cred(struct cifs_sb_info *cifs_sb)
 
 	return false;
 }
+
+void
+cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add)
+{
+	spin_lock(&server->req_lock);
+	server->credits += add;
+	server->in_flight--;
+	spin_unlock(&server->req_lock);
+	wake_up(&server->request_q);
+}
+
+void
+cifs_set_credits(struct TCP_Server_Info *server, const int val)
+{
+	spin_lock(&server->req_lock);
+	server->credits = val;
+	server->oplocks = val > 1 ? enable_oplocks : false;
+	spin_unlock(&server->req_lock);
+}
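
The two helpers above are the heart of the inFlight-to-credits conversion. A compact userspace model of their semantics, assuming a pthread mutex in place of server->req_lock and a condition variable in place of the request_q wait queue; it is a sketch, not the kernel code:

#include <pthread.h>
#include <stdbool.h>

struct server {
	pthread_mutex_t req_lock;
	pthread_cond_t	request_q;	/* models wake_up(&server->request_q) */
	int		credits;
	int		in_flight;
	bool		oplocks;
};

static const bool enable_oplocks = true;	/* module parameter stand-in */

void add_credits(struct server *s, unsigned int add)
{
	pthread_mutex_lock(&s->req_lock);
	s->credits += add;	/* a completed request returns its credit */
	s->in_flight--;
	pthread_mutex_unlock(&s->req_lock);
	pthread_cond_broadcast(&s->request_q);
}

void set_credits(struct server *s, int val)
{
	pthread_mutex_lock(&s->req_lock);
	s->credits = val;
	/* a server holding a single credit cannot grant oplock breaks */
	s->oplocks = val > 1 ? enable_oplocks : false;
	pthread_mutex_unlock(&s->req_lock);
}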
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 0cc9584f5889..310918b6fcb4 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -254,44 +254,60 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
 	return smb_sendv(server, &iov, 1);
 }
 
-static int wait_for_free_request(struct TCP_Server_Info *server,
-				 const int long_op)
+static int
+wait_for_free_credits(struct TCP_Server_Info *server, const int optype,
+		      int *credits)
 {
-	if (long_op == CIFS_ASYNC_OP) {
+	int rc;
+
+	spin_lock(&server->req_lock);
+	if (optype == CIFS_ASYNC_OP) {
 		/* oplock breaks must not be held up */
-		atomic_inc(&server->inFlight);
+		server->in_flight++;
+		*credits -= 1;
+		spin_unlock(&server->req_lock);
 		return 0;
 	}
 
-	spin_lock(&GlobalMid_Lock);
 	while (1) {
-		if (atomic_read(&server->inFlight) >= cifs_max_pending) {
-			spin_unlock(&GlobalMid_Lock);
+		if (*credits <= 0) {
+			spin_unlock(&server->req_lock);
 			cifs_num_waiters_inc(server);
-			wait_event(server->request_q,
-				   atomic_read(&server->inFlight)
-				     < cifs_max_pending);
+			rc = wait_event_killable(server->request_q,
+						 has_credits(server, credits));
 			cifs_num_waiters_dec(server);
-			spin_lock(&GlobalMid_Lock);
+			if (rc)
+				return rc;
+			spin_lock(&server->req_lock);
 		} else {
 			if (server->tcpStatus == CifsExiting) {
-				spin_unlock(&GlobalMid_Lock);
+				spin_unlock(&server->req_lock);
 				return -ENOENT;
 			}
 
-			/* can not count locking commands against total
-			   as they are allowed to block on server */
+			/*
			 * Can not count locking commands against total
			 * as they are allowed to block on server.
			 */
 
 			/* update # of requests on the wire to server */
-			if (long_op != CIFS_BLOCKING_OP)
-				atomic_inc(&server->inFlight);
-			spin_unlock(&GlobalMid_Lock);
+			if (optype != CIFS_BLOCKING_OP) {
+				*credits -= 1;
+				server->in_flight++;
+			}
+			spin_unlock(&server->req_lock);
 			break;
 		}
 	}
 	return 0;
 }
 
+static int
+wait_for_free_request(struct TCP_Server_Info *server, const int optype)
+{
+	return wait_for_free_credits(server, optype, get_credits_field(server));
+}
+
 static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
 			struct mid_q_entry **ppmidQ)
 {
@@ -359,7 +375,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
 	mid = AllocMidQEntry(hdr, server);
 	if (mid == NULL) {
 		mutex_unlock(&server->srv_mutex);
-		atomic_dec(&server->inFlight);
+		cifs_add_credits(server, 1);
 		wake_up(&server->request_q);
 		return -ENOMEM;
 	}
@@ -392,7 +408,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
 	return rc;
 out_err:
 	delete_mid(mid);
-	atomic_dec(&server->inFlight);
+	cifs_add_credits(server, 1);
 	wake_up(&server->request_q);
 	return rc;
 }
@@ -564,8 +580,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 		mutex_unlock(&ses->server->srv_mutex);
 		cifs_small_buf_release(in_buf);
 		/* Update # of requests on wire to server */
-		atomic_dec(&ses->server->inFlight);
-		wake_up(&ses->server->request_q);
+		cifs_add_credits(ses->server, 1);
 		return rc;
 	}
 	rc = cifs_sign_smb2(iov, n_vec, ses->server, &midQ->sequence_number);
@@ -601,8 +616,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 		midQ->callback = DeleteMidQEntry;
 		spin_unlock(&GlobalMid_Lock);
 		cifs_small_buf_release(in_buf);
-		atomic_dec(&ses->server->inFlight);
-		wake_up(&ses->server->request_q);
+		cifs_add_credits(ses->server, 1);
 		return rc;
 	}
 	spin_unlock(&GlobalMid_Lock);
@@ -612,8 +626,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 
 	rc = cifs_sync_mid_result(midQ, ses->server);
 	if (rc != 0) {
-		atomic_dec(&ses->server->inFlight);
-		wake_up(&ses->server->request_q);
+		cifs_add_credits(ses->server, 1);
 		return rc;
 	}
 
@@ -637,8 +650,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 	midQ->resp_buf = NULL;
out:
 	delete_mid(midQ);
-	atomic_dec(&ses->server->inFlight);
-	wake_up(&ses->server->request_q);
+	cifs_add_credits(ses->server, 1);
 
 	return rc;
 }
@@ -688,8 +700,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 	if (rc) {
 		mutex_unlock(&ses->server->srv_mutex);
 		/* Update # of requests on wire to server */
-		atomic_dec(&ses->server->inFlight);
-		wake_up(&ses->server->request_q);
+		cifs_add_credits(ses->server, 1);
 		return rc;
 	}
 
@@ -721,8 +732,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 		/* no longer considered to be "in-flight" */
 		midQ->callback = DeleteMidQEntry;
 		spin_unlock(&GlobalMid_Lock);
-		atomic_dec(&ses->server->inFlight);
-		wake_up(&ses->server->request_q);
+		cifs_add_credits(ses->server, 1);
 		return rc;
 	}
 	spin_unlock(&GlobalMid_Lock);
@@ -730,8 +740,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 
 	rc = cifs_sync_mid_result(midQ, ses->server);
 	if (rc != 0) {
-		atomic_dec(&ses->server->inFlight);
-		wake_up(&ses->server->request_q);
+		cifs_add_credits(ses->server, 1);
 		return rc;
 	}
 
@@ -747,8 +756,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 	rc = cifs_check_receive(midQ, ses->server, 0);
out:
 	delete_mid(midQ);
-	atomic_dec(&ses->server->inFlight);
-	wake_up(&ses->server->request_q);
+	cifs_add_credits(ses->server, 1);
 
 	return rc;
 }
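
The consumer half of the same model, approximating wait_for_free_credits(): take a credit or sleep until a completion path returns one. The CIFS_ASYNC_OP fast path may drive credits negative by design, since oplock breaks must not be held up; wait_event_killable() is approximated with a condition-variable loop. Again a sketch, not the kernel code:

#include <pthread.h>
#include <stdbool.h>

struct server {
	pthread_mutex_t req_lock;
	pthread_cond_t	request_q;
	int		credits;
	int		in_flight;
};

int wait_for_free_credits(struct server *s, bool async)
{
	pthread_mutex_lock(&s->req_lock);
	if (async) {		/* oplock breaks must not be held up */
		s->credits--;
		s->in_flight++;
		pthread_mutex_unlock(&s->req_lock);
		return 0;
	}
	while (s->credits <= 0)	/* models wait_event_killable() */
		pthread_cond_wait(&s->request_q, &s->req_lock);
	s->credits--;
	s->in_flight++;
	pthread_mutex_unlock(&s->req_lock);
	return 0;
}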
diff --git a/fs/compat.c b/fs/compat.c
index 07880bae28a9..14483a715bbb 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -33,7 +33,6 @@
 #include <linux/nfs4_mount.h>
 #include <linux/syscalls.h>
 #include <linux/ctype.h>
-#include <linux/module.h>
 #include <linux/dirent.h>
 #include <linux/fsnotify.h>
 #include <linux/highuid.h>
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 10d8cd90ca6f..debdfe0fc809 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -49,7 +49,6 @@
 #include <linux/elevator.h>
 #include <linux/rtc.h>
 #include <linux/pci.h>
-#include <linux/module.h>
 #include <linux/serial.h>
 #include <linux/if_tun.h>
 #include <linux/ctype.h>
diff --git a/fs/dcache.c b/fs/dcache.c
index 2b55bd0c1061..b60ddc41d783 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -23,7 +23,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/cache.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/mount.h>
 #include <linux/file.h>
 #include <asm/uaccess.h>
@@ -2404,6 +2404,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 		if (d_ancestor(alias, dentry)) {
 			/* Check for loops */
 			actual = ERR_PTR(-ELOOP);
+			spin_unlock(&inode->i_lock);
 		} else if (IS_ROOT(alias)) {
 			/* Is this an anonymous mountpoint that we
 			 * could splice into our tree? */
@@ -2413,7 +2414,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 			goto found;
 		} else {
 			/* Nope, but we must(!) avoid directory
-			 * aliasing */
+			 * aliasing. This drops inode->i_lock */
 			actual = __d_unalias(inode, dentry, alias);
 		}
 		write_sequnlock(&rename_lock);
diff --git a/fs/dcookies.c b/fs/dcookies.c
index dda0dc702d1b..17c779967828 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -13,7 +13,7 @@
  */
 
 #include <linux/syscalls.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/mount.h>
diff --git a/fs/eventfd.c b/fs/eventfd.c
index d9a591773919..dba15fecf23e 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -16,7 +16,7 @@
 #include <linux/spinlock.h>
 #include <linux/anon_inodes.h>
 #include <linux/syscalls.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/kref.h>
 #include <linux/eventfd.h>
 
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2a7dcd6ddc09..739b0985b398 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -426,6 +426,31 @@ out_unlock:
 	return error;
 }
 
+/*
+ * As described in commit 0ccf831cb ("lockdep: annotate epoll"), the use of
+ * wait queues used by epoll is done in a very controlled manner. Wake ups
+ * can nest inside each other, but are never done with the same locking.
+ * For example:
+ *
+ *   dfd = socket(...);
+ *   efd1 = epoll_create();
+ *   efd2 = epoll_create();
+ *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
+ *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
+ *
+ * When a packet arrives on the device underneath "dfd", the net code will
+ * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
+ * callback wakeup entry on that queue, and the wake_up() performed by the
+ * "dfd" net code will end up in ep_poll_callback(). At this point epoll
+ * (efd1) notices that it may have some event ready, so it needs to wake up
+ * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
+ * that ends up in another wake_up(), after having checked the recursion
+ * constraints: no more than EP_MAX_POLLWAKE_NESTS nested wakeups, to avoid
+ * stack blasting.
+ *
+ * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
+ * this special case of epoll.
+ */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
 				     unsigned long events, int subclass)
@@ -698,9 +723,12 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 			       void *priv)
 {
 	struct epitem *epi, *tmp;
+	poll_table pt;
 
+	init_poll_funcptr(&pt, NULL);
 	list_for_each_entry_safe(epi, tmp, head, rdllink) {
-		if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+		pt._key = epi->event.events;
+		if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
 		    epi->event.events)
 			return POLLIN | POLLRDNORM;
 		else {
@@ -1048,13 +1076,11 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
  */
static int reverse_path_check(void)
{
-	int length = 0;
 	int error = 0;
 	struct file *current_file;
 
 	/* let's call this for all tfiles */
 	list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
-		length++;
 		path_count_init();
 		error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
 					reverse_path_check_proc, current_file,
@@ -1096,6 +1122,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	/* Initialize the poll table using the queue callback */
 	epq.epi = epi;
 	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
+	epq.pt._key = event->events;
 
 	/*
	 * Attach the item to the poll hooks and get current event bits.
@@ -1190,6 +1217,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
{
 	int pwake = 0;
 	unsigned int revents;
+	poll_table pt;
+
+	init_poll_funcptr(&pt, NULL);
 
 	/*
	 * Set the new event interest mask before calling f_op->poll();
@@ -1197,13 +1227,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
	 * f_op->poll() call and the new event set registering.
	 */
 	epi->event.events = event->events;
+	pt._key = event->events;
 	epi->event.data = event->data; /* protected by mtx */
 
 	/*
	 * Get current event bits. We can safely use the file* here because
	 * its usage count has been increased by the caller of this function.
	 */
-	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
+	revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
 
 	/*
	 * If the item is "hot" and it is not registered inside the ready
@@ -1238,6 +1269,9 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 	unsigned int revents;
 	struct epitem *epi;
 	struct epoll_event __user *uevent;
+	poll_table pt;
+
+	init_poll_funcptr(&pt, NULL);
 
 	/*
	 * We can loop without lock because we are passed a task private list.
@@ -1250,7 +1284,8 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 
 		list_del_init(&epi->rdllink);
 
-		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+		pt._key = epi->event.events;
+		revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
 			epi->event.events;
 
 		/*
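
The recurring pattern in these eventpoll hunks is passing a poll_table whose _key carries the events the caller cares about, so f_op->poll() implementations can skip work for events nobody is waiting on. A toy illustration of what a poll routine can do with that mask, with simplified types that are not the kernel's:

#include <stdio.h>

#define POLLIN	0x001
#define POLLOUT	0x004

typedef struct poll_table { unsigned long _key; } poll_table;

struct sock { int rx_ready, tx_room; };

static unsigned int sock_poll(struct sock *sk, poll_table *pt)
{
	unsigned int mask = 0;

	/* only compute readiness the caller asked about via _key */
	if ((pt->_key & POLLIN) && sk->rx_ready)
		mask |= POLLIN;
	if ((pt->_key & POLLOUT) && sk->tx_room)
		mask |= POLLOUT;
	return mask;
}

int main(void)
{
	struct sock sk = { .rx_ready = 1, .tx_room = 1 };
	poll_table pt = { ._key = POLLIN };	/* item wants POLLIN only */

	printf("events: 0x%x\n", sock_poll(&sk, &pt));	/* 0x1, no POLLOUT */
	return 0;
}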
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index a2038928f9a3..1e036b79384c 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1743,8 +1743,11 @@ allocated:
 
 	*errp = 0;
 	brelse(bitmap_bh);
-	dquot_free_block(inode, *count-num);
-	*count = num;
+
+	if (num < *count) {
+		dquot_free_block(inode, *count-num);
+		*count = num;
+	}
 
 	trace_ext3_allocate_blocks(inode, goal, num,
 				   (unsigned long long)ret_block);
@@ -1970,7 +1973,7 @@ static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,
 	sbi = EXT3_SB(sb);
 
 	/* Walk through the whole group */
-	while (start < max) {
+	while (start <= max) {
 		start = bitmap_search_next_usable_block(start, bitmap_bh, max);
 		if (start < 0)
 			break;
@@ -1980,7 +1983,7 @@ static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,
		 * Allocate contiguous free extents by setting bits in the
		 * block bitmap
		 */
-		while (next < max
+		while (next <= max
 		       && claim_block(sb_bgl_lock(sbi, group),
 				      next, bitmap_bh)) {
 			next++;
@@ -2091,73 +2094,74 @@ err_out:
  */
int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
-	ext3_grpblk_t last_block, first_block, free_blocks;
-	unsigned long first_group, last_group;
-	unsigned long group, ngroups;
+	ext3_grpblk_t last_block, first_block;
+	unsigned long group, first_group, last_group;
 	struct ext3_group_desc *gdp;
 	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
-	uint64_t start, len, minlen, trimmed;
+	uint64_t start, minlen, end, trimmed = 0;
+	ext3_fsblk_t first_data_blk =
+			le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
 	ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
 	int ret = 0;
 
-	start = (range->start >> sb->s_blocksize_bits) +
-		le32_to_cpu(es->s_first_data_block);
-	len = range->len >> sb->s_blocksize_bits;
+	start = range->start >> sb->s_blocksize_bits;
+	end = start + (range->len >> sb->s_blocksize_bits) - 1;
 	minlen = range->minlen >> sb->s_blocksize_bits;
-	trimmed = 0;
 
-	if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
+	if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)) ||
+	    unlikely(start >= max_blks))
 		return -EINVAL;
-	if (start >= max_blks)
-		return -EINVAL;
-	if (start + len > max_blks)
-		len = max_blks - start;
+	if (end >= max_blks)
+		end = max_blks - 1;
+	if (end <= first_data_blk)
+		goto out;
+	if (start < first_data_blk)
+		start = first_data_blk;
 
-	ngroups = EXT3_SB(sb)->s_groups_count;
 	smp_rmb();
 
 	/* Determine first and last group to examine based on start and len */
 	ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
 				     &first_group, &first_block);
-	ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len),
+	ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end,
 				     &last_group, &last_block);
-	last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
-	last_block = EXT3_BLOCKS_PER_GROUP(sb);
 
-	if (first_group > last_group)
-		return -EINVAL;
+	/* end now represents the last block to discard in this group */
+	end = EXT3_BLOCKS_PER_GROUP(sb) - 1;
 
 	for (group = first_group; group <= last_group; group++) {
 		gdp = ext3_get_group_desc(sb, group, NULL);
 		if (!gdp)
 			break;
 
-		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-		if (free_blocks < minlen)
-			continue;
-
 		/*
		 * For all the groups except the last one, last block will
-		 * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to
-		 * change it for the last group in which case first_block +
-		 * len < EXT3_BLOCKS_PER_GROUP(sb).
+		 * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to
+		 * change it for the last group, note that last_block is
+		 * already computed earlier by ext3_get_group_no_and_offset()
		 */
-		if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb))
-			last_block = first_block + len;
-		len -= last_block - first_block;
+		if (group == last_group)
+			end = last_block;
 
-		ret = ext3_trim_all_free(sb, group, first_block,
-					last_block, minlen);
-		if (ret < 0)
-			break;
+		if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) {
+			ret = ext3_trim_all_free(sb, group, first_block,
+						 end, minlen);
+			if (ret < 0)
+				break;
+			trimmed += ret;
+		}
 
-		trimmed += ret;
+		/*
+		 * For every group except the first one, we are sure
+		 * that the first block to discard will be block #0.
+		 */
 		first_block = 0;
 	}
 
-	if (ret >= 0)
+	if (ret > 0)
 		ret = 0;
-	range->len = trimmed * sb->s_blocksize;
 
+out:
+	range->len = trimmed * sb->s_blocksize;
 	return ret;
}
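
The rewritten ext3_trim_fs() treats end as an inclusive last block and clamps the range against the filesystem size and the first data block before walking groups. The standalone helper below reproduces just that range setup, under assumed block-size and filesystem parameters:

#include <stdint.h>
#include <stdio.h>

struct trim_range { uint64_t start, len, minlen; };

static int setup_trim(const struct trim_range *r, unsigned bits,
		      uint64_t max_blks, uint64_t first_data_blk,
		      uint64_t *start, uint64_t *end)
{
	*start = r->start >> bits;
	*end = *start + (r->len >> bits) - 1;	/* inclusive last block */

	if (*start >= max_blks)
		return -1;			/* -EINVAL */
	if (*end >= max_blks)
		*end = max_blks - 1;
	if (*end <= first_data_blk)
		return 1;			/* nothing to trim */
	if (*start < first_data_blk)
		*start = first_data_blk;
	return 0;
}

int main(void)
{
	uint64_t s, e;
	struct trim_range r = { .start = 0, .len = 1ULL << 40, .minlen = 0 };

	/* 4KiB blocks, 1M-block fs whose first data block is 1 */
	if (setup_trim(&r, 12, 1 << 20, 1, &s, &e) == 0)
		printf("trim blocks %llu..%llu\n",
		       (unsigned long long)s, (unsigned long long)e);
	return 0;
}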
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2d0afeca0b47..6d3418662b54 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -756,6 +756,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
 	struct ext3_block_alloc_info *block_i;
 	ext3_fsblk_t current_block;
 	struct ext3_inode_info *ei = EXT3_I(inode);
+	struct timespec now;
 
 	block_i = ei->i_block_alloc_info;
 	/*
@@ -795,9 +796,11 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
 	}
 
 	/* We are done with atomic stuff, now do the rest of housekeeping */
-
-	inode->i_ctime = CURRENT_TIME_SEC;
-	ext3_mark_inode_dirty(handle, inode);
+	now = CURRENT_TIME_SEC;
+	if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) {
+		inode->i_ctime = now;
+		ext3_mark_inode_dirty(handle, inode);
+	}
 	/* ext3_mark_inode_dirty already updated i_sync_tid */
 	atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
 
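
The idea in the ext3_splice_branch() hunk is to skip redirtying the inode when i_ctime would not actually change, except when the block pointer landed in the inode itself (the !where->bh case), which must always be journaled. A userspace rendering of that test, with an in_inode flag standing in for !where->bh:

#include <stdbool.h>
#include <time.h>

struct inode { struct timespec i_ctime; bool dirty; };

static bool timespec_equal(const struct timespec *a, const struct timespec *b)
{
	return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
}

/* in_inode stands in for the kernel's !where->bh case */
void update_ctime(struct inode *inode, bool in_inode)
{
	struct timespec now;

	clock_gettime(CLOCK_REALTIME, &now);
	now.tv_nsec = 0;		/* CURRENT_TIME_SEC granularity */
	if (!timespec_equal(&inode->i_ctime, &now) || in_inode) {
		inode->i_ctime = now;
		inode->dirty = true;	/* ext3_mark_inode_dirty() */
	}
}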
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f9e2cd8cf711..4bbd07a6fa18 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -336,10 +336,10 @@ err_out:
 * Return buffer_head on success or NULL in case of failure.
 */
struct buffer_head *
-ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
{
 	struct ext4_group_desc *desc;
-	struct buffer_head *bh = NULL;
+	struct buffer_head *bh;
 	ext4_fsblk_t bitmap_blk;
 
 	desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -348,9 +348,9 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	bitmap_blk = ext4_block_bitmap(sb, desc);
 	bh = sb_getblk(sb, bitmap_blk);
 	if (unlikely(!bh)) {
-		ext4_error(sb, "Cannot read block bitmap - "
+		ext4_error(sb, "Cannot get buffer for block bitmap - "
 			    "block_group = %u, block_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
 	}
 
@@ -382,25 +382,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 		return bh;
 	}
 	/*
-	 * submit the buffer_head for read. We can
-	 * safely mark the bitmap as uptodate now.
-	 * We do it here so the bitmap uptodate bit
-	 * get set with buffer lock held.
+	 * submit the buffer_head for reading
	 */
+	set_buffer_new(bh);
 	trace_ext4_read_block_bitmap_load(sb, block_group);
-	set_bitmap_uptodate(bh);
-	if (bh_submit_read(bh) < 0) {
-		put_bh(bh);
+	bh->b_end_io = ext4_end_bitmap_read;
+	get_bh(bh);
+	submit_bh(READ, bh);
+	return bh;
+}
+
+/* Returns 0 on success, 1 on error */
+int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
+			   struct buffer_head *bh)
+{
+	struct ext4_group_desc *desc;
+
+	if (!buffer_new(bh))
+		return 0;
+	desc = ext4_get_group_desc(sb, block_group, NULL);
+	if (!desc)
+		return 1;
+	wait_on_buffer(bh);
+	if (!buffer_uptodate(bh)) {
 		ext4_error(sb, "Cannot read block bitmap - "
 			   "block_group = %u, block_bitmap = %llu",
-			   block_group, bitmap_blk);
-		return NULL;
+			   block_group, (unsigned long long) bh->b_blocknr);
+		return 1;
 	}
+	clear_buffer_new(bh);
+	/* Panic or remount fs read-only if block bitmap is invalid */
 	ext4_valid_block_bitmap(sb, desc, block_group, bh);
-	/*
-	 * file system mounted not to panic on error,
-	 * continue with corrupt bitmap
-	 */
+	return 0;
+}
+
+struct buffer_head *
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+{
+	struct buffer_head *bh;
+
+	bh = ext4_read_block_bitmap_nowait(sb, block_group);
+	if (ext4_wait_block_bitmap(sb, block_group, bh)) {
+		put_bh(bh);
+		return NULL;
+	}
 	return bh;
}
 
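
Splitting ext4_read_block_bitmap() into a _nowait submit and a separate wait step lets callers batch bitmap reads and overlap them with other work. A loose userspace analogy, using threads in place of the buffer_head end_io completion machinery:

#include <pthread.h>
#include <stdio.h>

struct bitmap_read {
	pthread_t tid;
	int group;
	int result;
};

static void *do_read(void *arg)
{
	struct bitmap_read *br = arg;
	br->result = 0;			/* pretend the I/O succeeded */
	return NULL;
}

/* submit without waiting, like ext4_read_block_bitmap_nowait() */
static void read_bitmap_nowait(struct bitmap_read *br, int group)
{
	br->group = group;
	pthread_create(&br->tid, NULL, do_read, br);
}

/* collect the result, like ext4_wait_block_bitmap() */
static int wait_bitmap(struct bitmap_read *br)
{
	pthread_join(br->tid, NULL);
	return br->result;
}

int main(void)
{
	struct bitmap_read br[4];

	for (int g = 0; g < 4; g++)	/* batch the submissions... */
		read_bitmap_nowait(&br[g], g);
	for (int g = 0; g < 4; g++)	/* ...then wait as each is needed */
		if (wait_bitmap(&br[g]))
			fprintf(stderr, "group %d failed\n", g);
	return 0;
}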
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 164c56092e58..ad56866d729a 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -91,17 +91,17 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 		return 0;
 
 	if (filp)
-		ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0,
+		ext4_error_file(filp, function, line, bh->b_blocknr,
 				"bad entry in directory: %s - offset=%u(%u), "
 				"inode=%u, rec_len=%d, name_len=%d",
-				error_msg, (unsigned) (offset%bh->b_size),
+				error_msg, (unsigned) (offset % bh->b_size),
 				offset, le32_to_cpu(de->inode),
 				rlen, de->name_len);
 	else
-		ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0,
+		ext4_error_inode(dir, function, line, bh->b_blocknr,
 				"bad entry in directory: %s - offset=%u(%u), "
 				"inode=%u, rec_len=%d, name_len=%d",
-				error_msg, (unsigned) (offset%bh->b_size),
+				error_msg, (unsigned) (offset % bh->b_size),
 				offset, le32_to_cpu(de->inode),
 				rlen, de->name_len);
 
@@ -425,8 +425,9 @@ static int call_filldir(struct file *filp, void *dirent,
 	sb = inode->i_sb;
 
 	if (!fname) {
-		printk(KERN_ERR "EXT4-fs: call_filldir: called with "
-		       "null fname?!?\n");
+		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
+			 "called with null fname?!?", __func__, __LINE__,
+			 inode->i_ino, current->comm);
 		return 0;
 	}
 	curr_pos = hash2pos(fname->hash, fname->minor_hash);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 513004fc3d84..ded731ac8a32 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -53,7 +53,7 @@
 		printk(KERN_DEBUG f, ## a);	\
 	} while (0)
 #else
-#define ext4_debug(f, a...)	do {} while (0)
+#define ext4_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
 #define EXT4_ERROR_INODE(inode, fmt, a...) \
@@ -184,6 +184,8 @@ struct mpage_da_data {
 #define EXT4_IO_END_UNWRITTEN	0x0001
 #define EXT4_IO_END_ERROR	0x0002
 #define EXT4_IO_END_QUEUED	0x0004
+#define EXT4_IO_END_DIRECT	0x0008
+#define EXT4_IO_END_IN_FSYNC	0x0010
 
 struct ext4_io_page {
 	struct page	*p_page;
@@ -192,18 +194,25 @@ struct ext4_io_page {
 
 #define MAX_IO_PAGES 128
 
+/*
+ * For converting uninitialized extents on a work queue.
+ *
+ * 'page' is only used from the writepage() path; 'pages' is only used for
+ * buffered writes; they are used to keep page references until conversion
+ * takes place. For AIO/DIO, neither field is filled in.
+ */
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
 	struct inode		*inode;		/* file being written to */
 	unsigned int		flag;		/* unwritten or not */
-	struct page		*page;		/* page struct for buffer write */
+	struct page		*page;		/* for writepage() path */
 	loff_t			offset;		/* offset in the file */
 	ssize_t			size;		/* size of the extent */
 	struct work_struct	work;		/* data work queue */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
-	int			num_io_pages;
-	struct ext4_io_page	*pages[MAX_IO_PAGES];
+	int			num_io_pages;	/* for writepages() */
+	struct ext4_io_page	*pages[MAX_IO_PAGES]; /* for writepages() */
} ext4_io_end_t;
 
 struct ext4_io_submit {
@@ -923,6 +932,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_ERRORS_CONT		0x00010	/* Continue on errors */
 #define EXT4_MOUNT_ERRORS_RO		0x00020	/* Remount fs ro on errors */
 #define EXT4_MOUNT_ERRORS_PANIC		0x00040	/* Panic on errors */
+#define EXT4_MOUNT_ERRORS_MASK		0x00070
 #define EXT4_MOUNT_MINIX_DF		0x00080	/* Mimics the Minix statfs */
 #define EXT4_MOUNT_NOLOAD		0x00100	/* Don't use existing journal*/
 #define EXT4_MOUNT_DATA_FLAGS		0x00C00	/* Mode for data writes: */
@@ -941,7 +951,6 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DIOREAD_NOLOCK	0x400000 /* Enable support for dio read nolocking */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
-#define EXT4_MOUNT_I_VERSION		0x2000000 /* i_version support */
 #define EXT4_MOUNT_MBLK_IO_SUBMIT	0x4000000 /* multi-block io submits */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
@@ -1142,6 +1151,7 @@ struct ext4_sb_info {
 	unsigned int s_mount_opt;
 	unsigned int s_mount_opt2;
 	unsigned int s_mount_flags;
+	unsigned int s_def_mount_opt;
 	ext4_fsblk_t s_sb_block;
 	uid_t s_resuid;
 	gid_t s_resgid;
@@ -1420,8 +1430,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_FLEX_BG		0x0200
 #define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
-#define EXT4_FEATURE_INCOMPAT_INLINEDATA	0x2000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM	0x2000 /* use crc32c for bg */
 #define EXT4_FEATURE_INCOMPAT_LARGEDIR		0x4000 /* >2GB or 3-lvl htree */
+#define EXT4_FEATURE_INCOMPAT_INLINEDATA	0x8000 /* data in inode */
 
 #define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
 #define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1794,8 +1805,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
					    ext4_group_t block_group,
					    struct buffer_head ** bh);
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
-struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
-				      ext4_group_t block_group);
+
+extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
+						ext4_group_t block_group);
+extern int ext4_wait_block_bitmap(struct super_block *sb,
+				  ext4_group_t block_group,
+				  struct buffer_head *bh);
+extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
+						  ext4_group_t block_group);
 extern void ext4_init_block_bitmap(struct super_block *sb,
				   struct buffer_head *bh,
				   ext4_group_t group,
@@ -1841,6 +1858,7 @@ extern void ext4_check_inodes_bitmap(struct super_block *);
 extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
 extern int ext4_init_inode_table(struct super_block *sb,
				 ext4_group_t group, int barrier);
+extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
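
EXT4_MOUNT_ERRORS_MASK groups the three mutually exclusive errors= bits so that remount code can clear the old policy in one mask operation before setting the new one. A self-contained illustration:

#include <stdio.h>

#define EXT4_MOUNT_ERRORS_CONT	0x00010
#define EXT4_MOUNT_ERRORS_RO	0x00020
#define EXT4_MOUNT_ERRORS_PANIC	0x00040
#define EXT4_MOUNT_ERRORS_MASK	0x00070

int main(void)
{
	unsigned int s_mount_opt = EXT4_MOUNT_ERRORS_PANIC;

	/* switch policy: clear the whole group, then set the new bit */
	s_mount_opt = (s_mount_opt & ~EXT4_MOUNT_ERRORS_MASK) |
		      EXT4_MOUNT_ERRORS_RO;

	printf("errors=remount-ro set: %s\n",
	       (s_mount_opt & EXT4_MOUNT_ERRORS_MASK) ==
			EXT4_MOUNT_ERRORS_RO ? "yes" : "no");
	return 0;
}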
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index a52db3a69a30..0f58b86e3a02 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -47,9 +47,9 @@
 */
#define EXT_DEBUG__
#ifdef EXT_DEBUG
-#define ext_debug(a...)		printk(a)
+#define ext_debug(fmt, ...)	printk(fmt, ##__VA_ARGS__)
#else
-#define ext_debug(a...)
+#define ext_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
#endif
 
/*
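
Both ext4_debug() and ext_debug() now expand to no_printk() when disabled. Unlike an empty macro, the arguments are still type-checked against the format string and "set but unused" warnings disappear, while the compiler optimizes the call away. A userspace rendering of the same trick (GNU ##__VA_ARGS__ assumed):

#include <stdio.h>

static inline int no_printk(const char *fmt, ...)
	__attribute__((format(printf, 1, 2)));
static inline int no_printk(const char *fmt, ...) { (void)fmt; return 0; }

#ifdef EXT_DEBUG
#define ext_debug(fmt, ...)	printf(fmt, ##__VA_ARGS__)
#else
#define ext_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
#endif

int main(void)
{
	int blocks = 42;

	/* with an empty macro, a debug-only use of "blocks" would warn;
	 * a format-string mismatch here is caught even when disabled */
	ext_debug("freeing %d blocks\n", blocks);
	return 0;
}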
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 5802fa1dab18..83b20fcf9400 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -104,6 +104,78 @@
 #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
 #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
 
+/**
+ * struct ext4_journal_cb_entry - Base structure for callback information.
+ *
+ * This struct is a 'seed' structure for use with your own callback
+ * structs. If you are using callbacks you must allocate one of these
+ * or another struct of your own definition which has this struct
+ * as its first element and pass it to ext4_journal_callback_add().
+ */
+struct ext4_journal_cb_entry {
+	/* list information for other callbacks attached to the same handle */
+	struct list_head jce_list;
+
+	/*  Function to call with this callback structure */
+	void (*jce_func)(struct super_block *sb,
+			 struct ext4_journal_cb_entry *jce, int error);
+
+	/* user data goes here */
+};
+
+/**
+ * ext4_journal_callback_add: add a function to call after transaction commit
+ * @handle: active journal transaction handle to register callback on
+ * @func: callback function to call after the transaction has committed:
+ *        @sb: superblock of current filesystem for transaction
+ *        @jce: returned journal callback data
+ *        @rc: journal state at commit (0 = transaction committed properly)
+ * @jce: journal callback data (internal and function private data struct)
+ *
+ * The registered function will be called in the context of the journal thread
+ * after the transaction for which the handle was created has completed.
+ *
+ * No locks are held when the callback function is called, so it is safe to
+ * call blocking functions from within the callback, but the callback should
+ * not block or run for too long, or the filesystem will be blocked waiting
+ * for the next transaction to commit. No journaling functions can be used,
+ * or there is a risk of deadlock.
+ *
+ * There is no guaranteed calling order of multiple registered callbacks on
+ * the same transaction.
+ */
+static inline void ext4_journal_callback_add(handle_t *handle,
+			void (*func)(struct super_block *sb,
+				     struct ext4_journal_cb_entry *jce,
+				     int rc),
+			struct ext4_journal_cb_entry *jce)
+{
+	struct ext4_sb_info *sbi =
+			EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+	/* Add the jce to transaction's private list */
+	jce->jce_func = func;
+	spin_lock(&sbi->s_md_lock);
+	list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
+	spin_unlock(&sbi->s_md_lock);
+}
+
+/**
+ * ext4_journal_callback_del: delete a registered callback
+ * @handle: active journal transaction handle on which callback was registered
+ * @jce: registered journal callback entry to unregister
+ */
+static inline void ext4_journal_callback_del(handle_t *handle,
+					     struct ext4_journal_cb_entry *jce)
+{
+	struct ext4_sb_info *sbi =
+			EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+	spin_lock(&sbi->s_md_lock);
+	list_del_init(&jce->jce_list);
+	spin_unlock(&sbi->s_md_lock);
+}
+
 int
 ext4_mark_iloc_dirty(handle_t *handle,
 		     struct inode *inode,
@@ -261,43 +333,45 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle,
 /* super.c */
 int ext4_force_commit(struct super_block *sb);
 
-static inline int ext4_should_journal_data(struct inode *inode)
+/*
+ * Ext4 inode journal modes
+ */
+#define EXT4_INODE_JOURNAL_DATA_MODE	0x01 /* journal data mode */
+#define EXT4_INODE_ORDERED_DATA_MODE	0x02 /* ordered data mode */
+#define EXT4_INODE_WRITEBACK_DATA_MODE	0x04 /* writeback data mode */
+
+static inline int ext4_inode_journal_mode(struct inode *inode)
 {
 	if (EXT4_JOURNAL(inode) == NULL)
-		return 0;
-	if (!S_ISREG(inode->i_mode))
-		return 1;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
-		return 1;
-	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
-		return 1;
-	return 0;
+		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
+	/* We do not support data journalling with delayed allocation */
+	if (!S_ISREG(inode->i_mode) ||
+	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */
+	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+	    !test_opt(inode->i_sb, DELALLOC))
+		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+		return EXT4_INODE_ORDERED_DATA_MODE;	/* ordered */
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
+	else
+		BUG();
+}
+
+static inline int ext4_should_journal_data(struct inode *inode)
+{
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE;
 }
 
 static inline int ext4_should_order_data(struct inode *inode)
 {
-	if (EXT4_JOURNAL(inode) == NULL)
-		return 0;
-	if (!S_ISREG(inode->i_mode))
-		return 0;
-	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
-		return 0;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
-		return 1;
-	return 0;
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE;
 }
 
 static inline int ext4_should_writeback_data(struct inode *inode)
 {
-	if (EXT4_JOURNAL(inode) == NULL)
-		return 1;
-	if (!S_ISREG(inode->i_mode))
-		return 0;
-	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
-		return 0;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
-		return 1;
-	return 0;
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE;
 }
 
 /*
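
The kernel-doc above spells out the contract: embed the ext4_journal_cb_entry as the first member of your own struct, register it with ext4_journal_callback_add(), and recover your container in the callback. The sketch below models that in plain C with a stubbed list and super_block; my_commit_info and my_commit_done are hypothetical names, not part of the patch:

#include <stddef.h>
#include <stdio.h>

struct super_block;

struct journal_cb_entry {
	struct journal_cb_entry *next;	/* stands in for jce_list */
	void (*func)(struct super_block *sb,
		     struct journal_cb_entry *jce, int error);
};

struct my_commit_info {
	struct journal_cb_entry jce;	/* must be the first member */
	unsigned long blocks_freed;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void my_commit_done(struct super_block *sb,
			   struct journal_cb_entry *jce, int error)
{
	struct my_commit_info *ci =
		container_of(jce, struct my_commit_info, jce);

	printf("commit %s, %lu blocks freed\n",
	       error ? "failed" : "done", ci->blocks_freed);
}

int main(void)
{
	struct my_commit_info ci = { .jce.func = my_commit_done,
				     .blocks_freed = 128 };

	/* the journal thread would do this after the transaction commits */
	ci.jce.func(NULL, &ci.jce, 0);
	return 0;
}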
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74f23c292e1b..1421938e6792 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,6 +44,14 @@
44 44
45#include <trace/events/ext4.h> 45#include <trace/events/ext4.h>
46 46
47/*
48 * used by extent splitting.
49 */
50#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
51 due to ENOSPC */
52#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
53#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
54
47static int ext4_split_extent(handle_t *handle, 55static int ext4_split_extent(handle_t *handle,
48 struct inode *inode, 56 struct inode *inode,
49 struct ext4_ext_path *path, 57 struct ext4_ext_path *path,
@@ -51,6 +59,13 @@ static int ext4_split_extent(handle_t *handle,
51 int split_flag, 59 int split_flag,
52 int flags); 60 int flags);
53 61
62static int ext4_split_extent_at(handle_t *handle,
63 struct inode *inode,
64 struct ext4_ext_path *path,
65 ext4_lblk_t split,
66 int split_flag,
67 int flags);
68
54static int ext4_ext_truncate_extend_restart(handle_t *handle, 69static int ext4_ext_truncate_extend_restart(handle_t *handle,
55 struct inode *inode, 70 struct inode *inode,
56 int needed) 71 int needed)
@@ -300,6 +315,8 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
300 ext4_fsblk_t block = ext4_ext_pblock(ext); 315 ext4_fsblk_t block = ext4_ext_pblock(ext);
301 int len = ext4_ext_get_actual_len(ext); 316 int len = ext4_ext_get_actual_len(ext);
302 317
318 if (len == 0)
319 return 0;
303 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); 320 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
304} 321}
305 322
@@ -2308,7 +2325,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2308 struct ext4_extent *ex; 2325 struct ext4_extent *ex;
2309 2326
2310 /* the header must be checked already in ext4_ext_remove_space() */ 2327 /* the header must be checked already in ext4_ext_remove_space() */
2311 ext_debug("truncate since %u in leaf\n", start); 2328 ext_debug("truncate since %u in leaf to %u\n", start, end);
2312 if (!path[depth].p_hdr) 2329 if (!path[depth].p_hdr)
2313 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 2330 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2314 eh = path[depth].p_hdr; 2331 eh = path[depth].p_hdr;
@@ -2343,14 +2360,17 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2343 ext_debug(" border %u:%u\n", a, b); 2360 ext_debug(" border %u:%u\n", a, b);
2344 2361
2345 /* If this extent is beyond the end of the hole, skip it */ 2362 /* If this extent is beyond the end of the hole, skip it */
2346 if (end <= ex_ee_block) { 2363 if (end < ex_ee_block) {
2347 ex--; 2364 ex--;
2348 ex_ee_block = le32_to_cpu(ex->ee_block); 2365 ex_ee_block = le32_to_cpu(ex->ee_block);
2349 ex_ee_len = ext4_ext_get_actual_len(ex); 2366 ex_ee_len = ext4_ext_get_actual_len(ex);
2350 continue; 2367 continue;
2351 } else if (b != ex_ee_block + ex_ee_len - 1) { 2368 } else if (b != ex_ee_block + ex_ee_len - 1) {
2352 EXT4_ERROR_INODE(inode," bad truncate %u:%u\n", 2369 EXT4_ERROR_INODE(inode,
2353 start, end); 2370 "can not handle truncate %u:%u "
2371 "on extent %u:%u",
2372 start, end, ex_ee_block,
2373 ex_ee_block + ex_ee_len - 1);
2354 err = -EIO; 2374 err = -EIO;
2355 goto out; 2375 goto out;
2356 } else if (a != ex_ee_block) { 2376 } else if (a != ex_ee_block) {
@@ -2482,7 +2502,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
2482 return 1; 2502 return 1;
2483} 2503}
2484 2504
2485static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) 2505static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2506 ext4_lblk_t end)
2486{ 2507{
2487 struct super_block *sb = inode->i_sb; 2508 struct super_block *sb = inode->i_sb;
2488 int depth = ext_depth(inode); 2509 int depth = ext_depth(inode);
@@ -2491,7 +2512,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2491 handle_t *handle; 2512 handle_t *handle;
2492 int i, err; 2513 int i, err;
2493 2514
2494 ext_debug("truncate since %u\n", start); 2515 ext_debug("truncate since %u to %u\n", start, end);
2495 2516
2496 /* probably first extent we're gonna free will be last in block */ 2517 /* probably first extent we're gonna free will be last in block */
2497 handle = ext4_journal_start(inode, depth + 1); 2518 handle = ext4_journal_start(inode, depth + 1);
@@ -2504,6 +2525,61 @@ again:
2504 trace_ext4_ext_remove_space(inode, start, depth); 2525 trace_ext4_ext_remove_space(inode, start, depth);
2505 2526
2506 /* 2527 /*
 2528	 * Check if we are removing extents inside the extent tree. If that
 2529	 * is the case, we are punching a hole inside the extent tree, so we
 2530	 * have to check whether the extent covering the last block to be
 2531	 * removed needs to be split, so that ext4_ext_rm_leaf() can easily
 2532	 * remove just its leading part.
2533 */
2534 if (end < EXT_MAX_BLOCKS - 1) {
2535 struct ext4_extent *ex;
2536 ext4_lblk_t ee_block;
2537
2538 /* find extent for this block */
2539 path = ext4_ext_find_extent(inode, end, NULL);
2540 if (IS_ERR(path)) {
2541 ext4_journal_stop(handle);
2542 return PTR_ERR(path);
2543 }
2544 depth = ext_depth(inode);
2545 ex = path[depth].p_ext;
2546 if (!ex)
2547 goto cont;
2548
2549 ee_block = le32_to_cpu(ex->ee_block);
2550
2551 /*
 2552	 * See if the last block is inside the extent; if so, split
 2553	 * the extent at the 'end' block so we can easily remove the
2554 * tail of the first part of the split extent in
2555 * ext4_ext_rm_leaf().
2556 */
2557 if (end >= ee_block &&
2558 end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
2559 int split_flag = 0;
2560
2561 if (ext4_ext_is_uninitialized(ex))
2562 split_flag = EXT4_EXT_MARK_UNINIT1 |
2563 EXT4_EXT_MARK_UNINIT2;
2564
2565 /*
2566 * Split the extent in two so that 'end' is the last
2567 * block in the first new extent
2568 */
2569 err = ext4_split_extent_at(handle, inode, path,
2570 end + 1, split_flag,
2571 EXT4_GET_BLOCKS_PRE_IO |
2572 EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
2573
2574 if (err < 0)
2575 goto out;
2576 }
2577 ext4_ext_drop_refs(path);
2578 kfree(path);
2579 }
2580cont:
2581
2582 /*
2507 * We start scanning from right side, freeing all the blocks 2583 * We start scanning from right side, freeing all the blocks
2508 * after i_size and walking into the tree depth-wise. 2584 * after i_size and walking into the tree depth-wise.
2509 */ 2585 */
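
The hunk above only splits the extent covering 'end'; the actual removal
still happens in ext4_ext_rm_leaf(). Since the boundary arithmetic is easy
to get wrong, here is a minimal user-space model of the split-at-end+1
step (extent_t and split_at() are hypothetical stand-ins, not kernel APIs):

#include <stdio.h>

typedef struct { unsigned start, len; } extent_t;

/* Split 'e' at logical block 'at' so that 'at' begins the second half. */
static int split_at(extent_t *e, unsigned at, extent_t *tail)
{
	if (at <= e->start || at >= e->start + e->len)
		return 0;			/* nothing to split */
	tail->start = at;
	tail->len   = e->start + e->len - at;
	e->len      = at - e->start;		/* first half ends at at-1 */
	return 1;
}

int main(void)
{
	extent_t ex = { .start = 100, .len = 50 };	/* covers 100..149 */
	extent_t tail;
	unsigned end = 119;			/* last block to punch out */

	/* Mirror of the kernel check: split at end+1 when end is inside. */
	if (end >= ex.start && end < ex.start + ex.len - 1 &&
	    split_at(&ex, end + 1, &tail))
		printf("split into %u..%u and %u..%u\n",
		       ex.start, ex.start + ex.len - 1,
		       tail.start, tail.start + tail.len - 1);
	return 0;
}

With the values above this prints "split into 100..119 and 120..149",
i.e. the punched range now ends exactly on an extent boundary.
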
@@ -2515,6 +2591,7 @@ again:
2515 } 2591 }
2516 path[0].p_depth = depth; 2592 path[0].p_depth = depth;
2517 path[0].p_hdr = ext_inode_hdr(inode); 2593 path[0].p_hdr = ext_inode_hdr(inode);
2594
2518 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2595 if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2519 err = -EIO; 2596 err = -EIO;
2520 goto out; 2597 goto out;
@@ -2526,7 +2603,7 @@ again:
2526 /* this is leaf block */ 2603 /* this is leaf block */
2527 err = ext4_ext_rm_leaf(handle, inode, path, 2604 err = ext4_ext_rm_leaf(handle, inode, path,
2528 &partial_cluster, start, 2605 &partial_cluster, start,
2529 EXT_MAX_BLOCKS - 1); 2606 end);
2530 /* root level has p_bh == NULL, brelse() eats this */ 2607 /* root level has p_bh == NULL, brelse() eats this */
2531 brelse(path[i].p_bh); 2608 brelse(path[i].p_bh);
2532 path[i].p_bh = NULL; 2609 path[i].p_bh = NULL;
@@ -2651,17 +2728,17 @@ void ext4_ext_init(struct super_block *sb)
2651 2728
2652 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 2729 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2653#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) 2730#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2654 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2731 printk(KERN_INFO "EXT4-fs: file extents enabled"
2655#ifdef AGGRESSIVE_TEST 2732#ifdef AGGRESSIVE_TEST
2656 printk(", aggressive tests"); 2733 ", aggressive tests"
2657#endif 2734#endif
2658#ifdef CHECK_BINSEARCH 2735#ifdef CHECK_BINSEARCH
2659 printk(", check binsearch"); 2736 ", check binsearch"
2660#endif 2737#endif
2661#ifdef EXTENTS_STATS 2738#ifdef EXTENTS_STATS
2662 printk(", stats"); 2739 ", stats"
2663#endif 2740#endif
2664 printk("\n"); 2741 "\n");
2665#endif 2742#endif
2666#ifdef EXTENTS_STATS 2743#ifdef EXTENTS_STATS
2667 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 2744 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
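
The rewrite above relies on adjacent C string literals concatenating into
a single argument across the #ifdef blocks, so the feature list is emitted
by one printk() call and cannot be interleaved with other console messages
the way the old per-fragment continuation printk()s could be. A minimal
sketch of the same trick:

#include <stdio.h>

int main(void)
{
	/* Literals separated only by preprocessor lines fuse into one
	 * string; compile with -DAGGRESSIVE_TEST etc. to see it grow. */
	printf("EXT4-fs: file extents enabled"
#ifdef AGGRESSIVE_TEST
	       ", aggressive tests"
#endif
#ifdef CHECK_BINSEARCH
	       ", check binsearch"
#endif
	       "\n");
	return 0;
}
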
@@ -2709,14 +2786,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2709} 2786}
2710 2787
2711/* 2788/*
2712 * used by extent splitting.
2713 */
2714#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
2715 due to ENOSPC */
2716#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
2717#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
2718
2719/*
2720 * ext4_split_extent_at() splits an extent at given block. 2789 * ext4_split_extent_at() splits an extent at given block.
2721 * 2790 *
2722 * @handle: the journal handle 2791 * @handle: the journal handle
@@ -3224,11 +3293,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3224 depth = ext_depth(inode); 3293 depth = ext_depth(inode);
3225 eh = path[depth].p_hdr; 3294 eh = path[depth].p_hdr;
3226 3295
3227 if (unlikely(!eh->eh_entries)) { 3296 /*
 3228		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "	3297	 * We're going to remove EOFBLOCKS_FL entirely in the future, so
 3229				 "EOFBLOCKS_FL set");			3298	 * we do not care about this case anymore. Simply remove the flag
3230 return -EIO; 3299 * if there are no extents.
3231 } 3300 */
3301 if (unlikely(!eh->eh_entries))
3302 goto out;
3232 last_ex = EXT_LAST_EXTENT(eh); 3303 last_ex = EXT_LAST_EXTENT(eh);
3233 /* 3304 /*
3234 * We should clear the EOFBLOCKS_FL flag if we are writing the 3305 * We should clear the EOFBLOCKS_FL flag if we are writing the
@@ -3252,6 +3323,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3252 for (i = depth-1; i >= 0; i--) 3323 for (i = depth-1; i >= 0; i--)
3253 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) 3324 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3254 return 0; 3325 return 0;
3326out:
3255 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 3327 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3256 return ext4_mark_inode_dirty(handle, inode); 3328 return ext4_mark_inode_dirty(handle, inode);
3257} 3329}
@@ -3710,8 +3782,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3710 int free_on_err = 0, err = 0, depth, ret; 3782 int free_on_err = 0, err = 0, depth, ret;
3711 unsigned int allocated = 0, offset = 0; 3783 unsigned int allocated = 0, offset = 0;
3712 unsigned int allocated_clusters = 0; 3784 unsigned int allocated_clusters = 0;
3713 unsigned int punched_out = 0;
3714 unsigned int result = 0;
3715 struct ext4_allocation_request ar; 3785 struct ext4_allocation_request ar;
3716 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3786 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3717 ext4_lblk_t cluster_offset; 3787 ext4_lblk_t cluster_offset;
@@ -3721,8 +3791,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3721 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 3791 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3722 3792
3723 /* check in cache */ 3793 /* check in cache */
3724 if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && 3794 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3725 ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3726 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3795 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3727 if ((sbi->s_cluster_ratio > 1) && 3796 if ((sbi->s_cluster_ratio > 1) &&
3728 ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) 3797 ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
@@ -3790,113 +3859,25 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3790 3859
3791 /* if found extent covers block, simply return it */ 3860 /* if found extent covers block, simply return it */
3792 if (in_range(map->m_lblk, ee_block, ee_len)) { 3861 if (in_range(map->m_lblk, ee_block, ee_len)) {
3793 struct ext4_map_blocks punch_map;
3794 ext4_fsblk_t partial_cluster = 0;
3795
3796 newblock = map->m_lblk - ee_block + ee_start; 3862 newblock = map->m_lblk - ee_block + ee_start;
3797 /* number of remaining blocks in the extent */ 3863 /* number of remaining blocks in the extent */
3798 allocated = ee_len - (map->m_lblk - ee_block); 3864 allocated = ee_len - (map->m_lblk - ee_block);
3799 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 3865 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3800 ee_block, ee_len, newblock); 3866 ee_block, ee_len, newblock);
3801 3867
3802 if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
3803 /*
3804 * Do not put uninitialized extent
3805 * in the cache
3806 */
3807 if (!ext4_ext_is_uninitialized(ex)) {
3808 ext4_ext_put_in_cache(inode, ee_block,
3809 ee_len, ee_start);
3810 goto out;
3811 }
3812 ret = ext4_ext_handle_uninitialized_extents(
3813 handle, inode, map, path, flags,
3814 allocated, newblock);
3815 return ret;
3816 }
3817
3818 /*
3819 * Punch out the map length, but only to the
3820 * end of the extent
3821 */
3822 punched_out = allocated < map->m_len ?
3823 allocated : map->m_len;
3824
3825 /* 3868 /*
3826 * Sense extents need to be converted to 3869 * Do not put uninitialized extent
3827 * uninitialized, they must fit in an 3870 * in the cache
3828 * uninitialized extent
3829 */ 3871 */
3830 if (punched_out > EXT_UNINIT_MAX_LEN) 3872 if (!ext4_ext_is_uninitialized(ex)) {
3831 punched_out = EXT_UNINIT_MAX_LEN; 3873 ext4_ext_put_in_cache(inode, ee_block,
3832 3874 ee_len, ee_start);
3833 punch_map.m_lblk = map->m_lblk; 3875 goto out;
3834 punch_map.m_pblk = newblock;
3835 punch_map.m_len = punched_out;
3836 punch_map.m_flags = 0;
3837
3838 /* Check to see if the extent needs to be split */
3839 if (punch_map.m_len != ee_len ||
3840 punch_map.m_lblk != ee_block) {
3841
3842 ret = ext4_split_extent(handle, inode,
3843 path, &punch_map, 0,
3844 EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
3845 EXT4_GET_BLOCKS_PRE_IO);
3846
3847 if (ret < 0) {
3848 err = ret;
3849 goto out2;
3850 }
3851 /*
3852 * find extent for the block at
3853 * the start of the hole
3854 */
3855 ext4_ext_drop_refs(path);
3856 kfree(path);
3857
3858 path = ext4_ext_find_extent(inode,
3859 map->m_lblk, NULL);
3860 if (IS_ERR(path)) {
3861 err = PTR_ERR(path);
3862 path = NULL;
3863 goto out2;
3864 }
3865
3866 depth = ext_depth(inode);
3867 ex = path[depth].p_ext;
3868 ee_len = ext4_ext_get_actual_len(ex);
3869 ee_block = le32_to_cpu(ex->ee_block);
3870 ee_start = ext4_ext_pblock(ex);
3871
3872 }
3873
3874 ext4_ext_mark_uninitialized(ex);
3875
3876 ext4_ext_invalidate_cache(inode);
3877
3878 err = ext4_ext_rm_leaf(handle, inode, path,
3879 &partial_cluster, map->m_lblk,
3880 map->m_lblk + punched_out);
3881
3882 if (!err && path->p_hdr->eh_entries == 0) {
3883 /*
3884 * Punch hole freed all of this sub tree,
3885 * so we need to correct eh_depth
3886 */
3887 err = ext4_ext_get_access(handle, inode, path);
3888 if (err == 0) {
3889 ext_inode_hdr(inode)->eh_depth = 0;
3890 ext_inode_hdr(inode)->eh_max =
3891 cpu_to_le16(ext4_ext_space_root(
3892 inode, 0));
3893
3894 err = ext4_ext_dirty(
3895 handle, inode, path);
3896 }
3897 } 3876 }
3898 3877 ret = ext4_ext_handle_uninitialized_extents(
3899 goto out2; 3878 handle, inode, map, path, flags,
3879 allocated, newblock);
3880 return ret;
3900 } 3881 }
3901 } 3882 }
3902 3883
@@ -4165,13 +4146,11 @@ out2:
4165 ext4_ext_drop_refs(path); 4146 ext4_ext_drop_refs(path);
4166 kfree(path); 4147 kfree(path);
4167 } 4148 }
4168 result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
4169 punched_out : allocated;
4170 4149
4171 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, 4150 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
4172 newblock, map->m_len, err ? err : result); 4151 newblock, map->m_len, err ? err : allocated);
4173 4152
4174 return err ? err : result; 4153 return err ? err : allocated;
4175} 4154}
4176 4155
4177void ext4_ext_truncate(struct inode *inode) 4156void ext4_ext_truncate(struct inode *inode)
@@ -4228,7 +4207,7 @@ void ext4_ext_truncate(struct inode *inode)
4228 4207
4229 last_block = (inode->i_size + sb->s_blocksize - 1) 4208 last_block = (inode->i_size + sb->s_blocksize - 1)
4230 >> EXT4_BLOCK_SIZE_BITS(sb); 4209 >> EXT4_BLOCK_SIZE_BITS(sb);
4231 err = ext4_ext_remove_space(inode, last_block); 4210 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4232 4211
4233 /* In a multi-transaction truncate, we only make the final 4212 /* In a multi-transaction truncate, we only make the final
4234 * transaction synchronous. 4213 * transaction synchronous.
@@ -4436,10 +4415,11 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
4436 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 4415 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
4437 if (ret <= 0) { 4416 if (ret <= 0) {
4438 WARN_ON(ret <= 0); 4417 WARN_ON(ret <= 0);
4439 printk(KERN_ERR "%s: ext4_ext_map_blocks " 4418 ext4_msg(inode->i_sb, KERN_ERR,
4440 "returned error inode#%lu, block=%u, " 4419 "%s:%d: inode #%lu: block %u: len %u: "
4441 "max_blocks=%u", __func__, 4420 "ext4_ext_map_blocks returned %d",
4442 inode->i_ino, map.m_lblk, map.m_len); 4421 __func__, __LINE__, inode->i_ino, map.m_lblk,
4422 map.m_len, ret);
4443 } 4423 }
4444 ext4_mark_inode_dirty(handle, inode); 4424 ext4_mark_inode_dirty(handle, inode);
4445 ret2 = ext4_journal_stop(handle); 4425 ret2 = ext4_journal_stop(handle);
@@ -4705,14 +4685,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4705{ 4685{
4706 struct inode *inode = file->f_path.dentry->d_inode; 4686 struct inode *inode = file->f_path.dentry->d_inode;
4707 struct super_block *sb = inode->i_sb; 4687 struct super_block *sb = inode->i_sb;
4708 struct ext4_ext_cache cache_ex; 4688 ext4_lblk_t first_block, stop_block;
4709 ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
4710 struct address_space *mapping = inode->i_mapping; 4689 struct address_space *mapping = inode->i_mapping;
4711 struct ext4_map_blocks map;
4712 handle_t *handle; 4690 handle_t *handle;
4713 loff_t first_page, last_page, page_len; 4691 loff_t first_page, last_page, page_len;
4714 loff_t first_page_offset, last_page_offset; 4692 loff_t first_page_offset, last_page_offset;
4715 int ret, credits, blocks_released, err = 0; 4693 int credits, err = 0;
4716 4694
4717 /* No need to punch hole beyond i_size */ 4695 /* No need to punch hole beyond i_size */
4718 if (offset >= inode->i_size) 4696 if (offset >= inode->i_size)
@@ -4728,10 +4706,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4728 offset; 4706 offset;
4729 } 4707 }
4730 4708
4731 first_block = (offset + sb->s_blocksize - 1) >>
4732 EXT4_BLOCK_SIZE_BITS(sb);
4733 last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4734
4735 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 4709 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4736 last_page = (offset + length) >> PAGE_CACHE_SHIFT; 4710 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4737 4711
@@ -4810,7 +4784,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4810 } 4784 }
4811 } 4785 }
4812 4786
4813
4814 /* 4787 /*
4815 * If i_size is contained in the last page, we need to 4788 * If i_size is contained in the last page, we need to
4816 * unmap and zero the partial page after i_size 4789 * unmap and zero the partial page after i_size
@@ -4830,73 +4803,22 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4830 } 4803 }
4831 } 4804 }
4832 4805
4806 first_block = (offset + sb->s_blocksize - 1) >>
4807 EXT4_BLOCK_SIZE_BITS(sb);
4808 stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4809
4833 /* If there are no blocks to remove, return now */ 4810 /* If there are no blocks to remove, return now */
4834 if (first_block >= last_block) 4811 if (first_block >= stop_block)
4835 goto out; 4812 goto out;
4836 4813
4837 down_write(&EXT4_I(inode)->i_data_sem); 4814 down_write(&EXT4_I(inode)->i_data_sem);
4838 ext4_ext_invalidate_cache(inode); 4815 ext4_ext_invalidate_cache(inode);
4839 ext4_discard_preallocations(inode); 4816 ext4_discard_preallocations(inode);
4840 4817
4841 /* 4818 err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
4842 * Loop over all the blocks and identify blocks
4843 * that need to be punched out
4844 */
4845 iblock = first_block;
4846 blocks_released = 0;
4847 while (iblock < last_block) {
4848 max_blocks = last_block - iblock;
4849 num_blocks = 1;
4850 memset(&map, 0, sizeof(map));
4851 map.m_lblk = iblock;
4852 map.m_len = max_blocks;
4853 ret = ext4_ext_map_blocks(handle, inode, &map,
4854 EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
4855
4856 if (ret > 0) {
4857 blocks_released += ret;
4858 num_blocks = ret;
4859 } else if (ret == 0) {
4860 /*
4861 * If map blocks could not find the block,
4862 * then it is in a hole. If the hole was
4863 * not already cached, then map blocks should
4864 * put it in the cache. So we can get the hole
4865 * out of the cache
4866 */
4867 memset(&cache_ex, 0, sizeof(cache_ex));
4868 if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
4869 !cache_ex.ec_start) {
4870
4871 /* The hole is cached */
4872 num_blocks = cache_ex.ec_block +
4873 cache_ex.ec_len - iblock;
4874
4875 } else {
4876 /* The block could not be identified */
4877 err = -EIO;
4878 break;
4879 }
4880 } else {
4881 /* Map blocks error */
4882 err = ret;
4883 break;
4884 }
4885
4886 if (num_blocks == 0) {
4887 /* This condition should never happen */
4888 ext_debug("Block lookup failed");
4889 err = -EIO;
4890 break;
4891 }
4892
4893 iblock += num_blocks;
4894 }
4895 4819
4896 if (blocks_released > 0) { 4820 ext4_ext_invalidate_cache(inode);
4897 ext4_ext_invalidate_cache(inode); 4821 ext4_discard_preallocations(inode);
4898 ext4_discard_preallocations(inode);
4899 }
4900 4822
4901 if (IS_SYNC(inode)) 4823 if (IS_SYNC(inode))
4902 ext4_handle_sync(handle); 4824 ext4_handle_sync(handle);
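
The first_block/stop_block arithmetic above rounds the start of the hole
up to a block boundary and the end down, so only blocks lying entirely
inside [offset, offset+length) are removed; ext4_ext_remove_space() is
then handed stop_block - 1 as its inclusive last block. A quick
user-space check of that math, with illustrative values only:

#include <stdio.h>

int main(void)
{
	unsigned long long blocksize = 4096, bits = 12;
	unsigned long long offset = 5000, length = 20000;

	/* Same expressions as the hunk above. */
	unsigned long long first_block = (offset + blocksize - 1) >> bits;
	unsigned long long stop_block  = (offset + length) >> bits;

	/* first_block = 2 (byte 8192), stop_block = 6 (25000 >> 12) */
	printf("remove blocks [%llu, %llu)\n", first_block, stop_block);
	return 0;
}

Blocks 2..5 (bytes 8192..24575) fall wholly inside the requested byte
range, so nothing outside the hole is ever freed.
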
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 00a2cb753efd..bb6c7d811313 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -89,6 +89,7 @@ int ext4_flush_completed_IO(struct inode *inode)
89 io = list_entry(ei->i_completed_io_list.next, 89 io = list_entry(ei->i_completed_io_list.next,
90 ext4_io_end_t, list); 90 ext4_io_end_t, list);
91 list_del_init(&io->list); 91 list_del_init(&io->list);
92 io->flag |= EXT4_IO_END_IN_FSYNC;
92 /* 93 /*
93 * Calling ext4_end_io_nolock() to convert completed 94 * Calling ext4_end_io_nolock() to convert completed
94 * IO to written. 95 * IO to written.
@@ -108,6 +109,7 @@ int ext4_flush_completed_IO(struct inode *inode)
108 if (ret < 0) 109 if (ret < 0)
109 ret2 = ret; 110 ret2 = ret;
110 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 111 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
112 io->flag &= ~EXT4_IO_END_IN_FSYNC;
111 } 113 }
112 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 114 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
113 return (ret2 < 0) ? ret2 : 0; 115 return (ret2 < 0) ? ret2 : 0;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 25d8c9781ad9..409c2ee7750a 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -92,6 +92,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
92 return EXT4_INODES_PER_GROUP(sb); 92 return EXT4_INODES_PER_GROUP(sb);
93} 93}
94 94
95void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
96{
97 if (uptodate) {
98 set_buffer_uptodate(bh);
99 set_bitmap_uptodate(bh);
100 }
101 unlock_buffer(bh);
102 put_bh(bh);
103}
104
95/* 105/*
96 * Read the inode allocation bitmap for a given block_group, reading 106 * Read the inode allocation bitmap for a given block_group, reading
97 * into the specified slot in the superblock's bitmap cache. 107 * into the specified slot in the superblock's bitmap cache.
@@ -147,18 +157,18 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
147 return bh; 157 return bh;
148 } 158 }
149 /* 159 /*
150 * submit the buffer_head for read. We can 160 * submit the buffer_head for reading
151 * safely mark the bitmap as uptodate now.
152 * We do it here so the bitmap uptodate bit
153 * get set with buffer lock held.
154 */ 161 */
155 trace_ext4_load_inode_bitmap(sb, block_group); 162 trace_ext4_load_inode_bitmap(sb, block_group);
156 set_bitmap_uptodate(bh); 163 bh->b_end_io = ext4_end_bitmap_read;
157 if (bh_submit_read(bh) < 0) { 164 get_bh(bh);
165 submit_bh(READ, bh);
166 wait_on_buffer(bh);
167 if (!buffer_uptodate(bh)) {
158 put_bh(bh); 168 put_bh(bh);
159 ext4_error(sb, "Cannot read inode bitmap - " 169 ext4_error(sb, "Cannot read inode bitmap - "
160 "block_group = %u, inode_bitmap = %llu", 170 "block_group = %u, inode_bitmap = %llu",
161 block_group, bitmap_blk); 171 block_group, bitmap_blk);
162 return NULL; 172 return NULL;
163 } 173 }
164 return bh; 174 return bh;
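
The read path above replaces bh_submit_read(), which marked the bitmap
uptodate before submission, with an explicit completion handler that sets
the uptodate bits only once the read has actually succeeded. A rough
user-space analogue of that pattern, with a worker thread standing in for
the block layer (fake_bh and everything around it is hypothetical):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_bh {
	pthread_mutex_t lock;
	pthread_cond_t  done;
	bool io_done, uptodate, bitmap_uptodate;
	void (*end_io)(struct fake_bh *, int);
};

static void end_bitmap_read(struct fake_bh *bh, int ok)
{
	pthread_mutex_lock(&bh->lock);
	if (ok) {			/* mark uptodate only on success */
		bh->uptodate = true;
		bh->bitmap_uptodate = true;
	}
	bh->io_done = true;
	pthread_cond_signal(&bh->done);
	pthread_mutex_unlock(&bh->lock);
}

static void *fake_block_layer(void *arg)	/* "completes" the read */
{
	struct fake_bh *bh = arg;
	bh->end_io(bh, 1);
	return NULL;
}

int main(void)
{
	struct fake_bh bh = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.done = PTHREAD_COND_INITIALIZER,
		.end_io = end_bitmap_read,
	};
	pthread_t t;

	pthread_create(&t, NULL, fake_block_layer, &bh);	/* submit */
	pthread_mutex_lock(&bh.lock);
	while (!bh.io_done)					/* wait   */
		pthread_cond_wait(&bh.done, &bh.lock);
	pthread_mutex_unlock(&bh.lock);
	pthread_join(t, NULL);

	printf("uptodate=%d bitmap_uptodate=%d\n",
	       bh.uptodate, bh.bitmap_uptodate);
	return 0;
}

If the "I/O" fails, neither flag gets set and the caller sees the buffer
as not uptodate, exactly the error branch in the hunk above.
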
@@ -194,19 +204,20 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
194 struct ext4_sb_info *sbi; 204 struct ext4_sb_info *sbi;
195 int fatal = 0, err, count, cleared; 205 int fatal = 0, err, count, cleared;
196 206
197 if (atomic_read(&inode->i_count) > 1) { 207 if (!sb) {
198 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", 208 printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
199 atomic_read(&inode->i_count)); 209 "nonexistent device\n", __func__, __LINE__);
200 return; 210 return;
201 } 211 }
202 if (inode->i_nlink) { 212 if (atomic_read(&inode->i_count) > 1) {
203 printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n", 213 ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
204 inode->i_nlink); 214 __func__, __LINE__, inode->i_ino,
215 atomic_read(&inode->i_count));
205 return; 216 return;
206 } 217 }
207 if (!sb) { 218 if (inode->i_nlink) {
 208		printk(KERN_ERR "ext4_free_inode: inode on "	 219		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d",
209 "nonexistent device\n"); 220 __func__, __LINE__, inode->i_ino, inode->i_nlink);
210 return; 221 return;
211 } 222 }
212 sbi = EXT4_SB(sb); 223 sbi = EXT4_SB(sb);
@@ -593,94 +604,6 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
593} 604}
594 605
595/* 606/*
596 * claim the inode from the inode bitmap. If the group
597 * is uninit we need to take the groups's ext4_group_lock
598 * and clear the uninit flag. The inode bitmap update
599 * and group desc uninit flag clear should be done
600 * after holding ext4_group_lock so that ext4_read_inode_bitmap
601 * doesn't race with the ext4_claim_inode
602 */
603static int ext4_claim_inode(struct super_block *sb,
604 struct buffer_head *inode_bitmap_bh,
605 unsigned long ino, ext4_group_t group, umode_t mode)
606{
607 int free = 0, retval = 0, count;
608 struct ext4_sb_info *sbi = EXT4_SB(sb);
609 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
610 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
611
612 /*
613 * We have to be sure that new inode allocation does not race with
614 * inode table initialization, because otherwise we may end up
615 * allocating and writing new inode right before sb_issue_zeroout
616 * takes place and overwriting our new inode with zeroes. So we
617 * take alloc_sem to prevent it.
618 */
619 down_read(&grp->alloc_sem);
620 ext4_lock_group(sb, group);
621 if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) {
622 /* not a free inode */
623 retval = 1;
624 goto err_ret;
625 }
626 ino++;
627 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
628 ino > EXT4_INODES_PER_GROUP(sb)) {
629 ext4_unlock_group(sb, group);
630 up_read(&grp->alloc_sem);
631 ext4_error(sb, "reserved inode or inode > inodes count - "
632 "block_group = %u, inode=%lu", group,
633 ino + group * EXT4_INODES_PER_GROUP(sb));
634 return 1;
635 }
636 /* If we didn't allocate from within the initialized part of the inode
637 * table then we need to initialize up to this inode. */
638 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
639
640 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
641 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
642 /* When marking the block group with
643 * ~EXT4_BG_INODE_UNINIT we don't want to depend
644 * on the value of bg_itable_unused even though
645 * mke2fs could have initialized the same for us.
646 * Instead we calculated the value below
647 */
648
649 free = 0;
650 } else {
651 free = EXT4_INODES_PER_GROUP(sb) -
652 ext4_itable_unused_count(sb, gdp);
653 }
654
655 /*
656 * Check the relative inode number against the last used
657 * relative inode number in this group. if it is greater
658 * we need to update the bg_itable_unused count
659 *
660 */
661 if (ino > free)
662 ext4_itable_unused_set(sb, gdp,
663 (EXT4_INODES_PER_GROUP(sb) - ino));
664 }
665 count = ext4_free_inodes_count(sb, gdp) - 1;
666 ext4_free_inodes_set(sb, gdp, count);
667 if (S_ISDIR(mode)) {
668 count = ext4_used_dirs_count(sb, gdp) + 1;
669 ext4_used_dirs_set(sb, gdp, count);
670 if (sbi->s_log_groups_per_flex) {
671 ext4_group_t f = ext4_flex_group(sbi, group);
672
673 atomic_inc(&sbi->s_flex_groups[f].used_dirs);
674 }
675 }
676 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
677err_ret:
678 ext4_unlock_group(sb, group);
679 up_read(&grp->alloc_sem);
680 return retval;
681}
682
683/*
684 * There are two policies for allocating an inode. If the new inode is 607 * There are two policies for allocating an inode. If the new inode is
685 * a directory, then a forward search is made for a block group with both 608 * a directory, then a forward search is made for a block group with both
686 * free space and a low directory-to-inode ratio; if that fails, then of 609 * free space and a low directory-to-inode ratio; if that fails, then of
@@ -741,6 +664,11 @@ got_group:
741 if (ret2 == -1) 664 if (ret2 == -1)
742 goto out; 665 goto out;
743 666
667 /*
668 * Normally we will only go through one pass of this loop,
669 * unless we get unlucky and it turns out the group we selected
670 * had its last inode grabbed by someone else.
671 */
744 for (i = 0; i < ngroups; i++, ino = 0) { 672 for (i = 0; i < ngroups; i++, ino = 0) {
745 err = -EIO; 673 err = -EIO;
746 674
@@ -757,51 +685,24 @@ repeat_in_this_group:
757 ino = ext4_find_next_zero_bit((unsigned long *) 685 ino = ext4_find_next_zero_bit((unsigned long *)
758 inode_bitmap_bh->b_data, 686 inode_bitmap_bh->b_data,
759 EXT4_INODES_PER_GROUP(sb), ino); 687 EXT4_INODES_PER_GROUP(sb), ino);
760 688 if (ino >= EXT4_INODES_PER_GROUP(sb)) {
761 if (ino < EXT4_INODES_PER_GROUP(sb)) { 689 if (++group == ngroups)
762 690 group = 0;
763 BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); 691 continue;
764 err = ext4_journal_get_write_access(handle,
765 inode_bitmap_bh);
766 if (err)
767 goto fail;
768
769 BUFFER_TRACE(group_desc_bh, "get_write_access");
770 err = ext4_journal_get_write_access(handle,
771 group_desc_bh);
772 if (err)
773 goto fail;
774 if (!ext4_claim_inode(sb, inode_bitmap_bh,
775 ino, group, mode)) {
776 /* we won it */
777 BUFFER_TRACE(inode_bitmap_bh,
778 "call ext4_handle_dirty_metadata");
779 err = ext4_handle_dirty_metadata(handle,
780 NULL,
781 inode_bitmap_bh);
782 if (err)
783 goto fail;
784 /* zero bit is inode number 1*/
785 ino++;
786 goto got;
787 }
788 /* we lost it */
789 ext4_handle_release_buffer(handle, inode_bitmap_bh);
790 ext4_handle_release_buffer(handle, group_desc_bh);
791
792 if (++ino < EXT4_INODES_PER_GROUP(sb))
793 goto repeat_in_this_group;
794 } 692 }
795 693 if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
796 /* 694 ext4_error(sb, "reserved inode found cleared - "
797 * This case is possible in concurrent environment. It is very 695 "inode=%lu", ino + 1);
798 * rare. We cannot repeat the find_group_xxx() call because 696 continue;
799 * that will simply return the same blockgroup, because the 697 }
800 * group descriptor metadata has not yet been updated. 698 ext4_lock_group(sb, group);
801 * So we just go onto the next blockgroup. 699 ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
802 */ 700 ext4_unlock_group(sb, group);
803 if (++group == ngroups) 701 ino++; /* the inode bitmap is zero-based */
804 group = 0; 702 if (!ret2)
703 goto got; /* we grabbed the inode! */
704 if (ino < EXT4_INODES_PER_GROUP(sb))
705 goto repeat_in_this_group;
805 } 706 }
806 err = -ENOSPC; 707 err = -ENOSPC;
807 goto out; 708 goto out;
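
The loop above claims an inode with ext4_test_and_set_bit() under
ext4_lock_group() before taking any journal write access, replacing the
heavyweight ext4_claim_inode() path; buffer dirtying and descriptor
updates move to the winner after 'got', as the next hunk shows. A toy
single-group version of the claim step using C11 atomics (the kernel's
group lock of course serializes more state than just this one bit):

#include <stdatomic.h>
#include <stdio.h>

#define INODES_PER_GROUP 32

static atomic_ulong bitmap;	/* one group's inode bitmap, bit = used */

/* Returns the claimed 1-based inode number, or 0 if the group is full. */
static unsigned long claim_inode(void)
{
	for (unsigned long ino = 0; ino < INODES_PER_GROUP; ino++) {
		unsigned long bit = 1UL << ino;

		if (atomic_load(&bitmap) & bit)
			continue;		/* looks busy, keep scanning */
		if (!(atomic_fetch_or(&bitmap, bit) & bit))
			return ino + 1;		/* we grabbed the inode */
		/* lost the race for this bit; retry within the group */
	}
	return 0;				/* -ENOSPC in the kernel */
}

int main(void)
{
	printf("claimed inode %lu\n", claim_inode());	/* 1 */
	printf("claimed inode %lu\n", claim_inode());	/* 2 */
	return 0;
}
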
@@ -838,6 +739,59 @@ got:
838 if (err) 739 if (err)
839 goto fail; 740 goto fail;
840 } 741 }
742
743 BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
744 err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
745 if (err)
746 goto fail;
747
748 BUFFER_TRACE(group_desc_bh, "get_write_access");
749 err = ext4_journal_get_write_access(handle, group_desc_bh);
750 if (err)
751 goto fail;
752
753 /* Update the relevant bg descriptor fields */
754 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
755 int free;
756 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
757
758 down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
759 ext4_lock_group(sb, group); /* while we modify the bg desc */
760 free = EXT4_INODES_PER_GROUP(sb) -
761 ext4_itable_unused_count(sb, gdp);
762 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
763 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
764 free = 0;
765 }
766 /*
767 * Check the relative inode number against the last used
 768	 * relative inode number in this group. If it is greater,
 769	 * we need to update the bg_itable_unused count.
770 */
771 if (ino > free)
772 ext4_itable_unused_set(sb, gdp,
773 (EXT4_INODES_PER_GROUP(sb) - ino));
774 up_read(&grp->alloc_sem);
775 }
776 ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
777 if (S_ISDIR(mode)) {
778 ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
779 if (sbi->s_log_groups_per_flex) {
780 ext4_group_t f = ext4_flex_group(sbi, group);
781
782 atomic_inc(&sbi->s_flex_groups[f].used_dirs);
783 }
784 }
785 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
786 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
787 ext4_unlock_group(sb, group);
788 }
789
790 BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
791 err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
792 if (err)
793 goto fail;
794
841 BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); 795 BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
842 err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); 796 err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
843 if (err) 797 if (err)
@@ -1101,7 +1055,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1101 * where it is called from on active part of filesystem is ext4lazyinit 1055 * where it is called from on active part of filesystem is ext4lazyinit
1102 * thread, so we do not need any special locks, however we have to prevent 1056 * thread, so we do not need any special locks, however we have to prevent
1103 * inode allocation from the current group, so we take alloc_sem lock, to 1057 * inode allocation from the current group, so we take alloc_sem lock, to
1104 * block ext4_claim_inode until we are finished. 1058 * block ext4_new_inode() until we are finished.
1105 */ 1059 */
1106int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, 1060int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1107 int barrier) 1061 int barrier)
@@ -1149,9 +1103,9 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1149 sbi->s_inodes_per_block); 1103 sbi->s_inodes_per_block);
1150 1104
1151 if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { 1105 if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
1152 ext4_error(sb, "Something is wrong with group %u\n" 1106 ext4_error(sb, "Something is wrong with group %u: "
1153 "Used itable blocks: %d" 1107 "used itable blocks: %d; "
1154 "itable unused count: %u\n", 1108 "itable unused count: %u",
1155 group, used_blks, 1109 group, used_blks,
1156 ext4_itable_unused_count(sb, gdp)); 1110 ext4_itable_unused_count(sb, gdp));
1157 ret = 1; 1111 ret = 1;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index feaa82fe629d..c77b0bd2c711 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -272,7 +272,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
272 trace_ext4_da_update_reserve_space(inode, used, quota_claim); 272 trace_ext4_da_update_reserve_space(inode, used, quota_claim);
273 if (unlikely(used > ei->i_reserved_data_blocks)) { 273 if (unlikely(used > ei->i_reserved_data_blocks)) {
274 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " 274 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
275 "with only %d reserved data blocks\n", 275 "with only %d reserved data blocks",
276 __func__, inode->i_ino, used, 276 __func__, inode->i_ino, used,
277 ei->i_reserved_data_blocks); 277 ei->i_reserved_data_blocks);
278 WARN_ON(1); 278 WARN_ON(1);
@@ -1165,7 +1165,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1165 */ 1165 */
1166 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " 1166 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
1167 "ino %lu, to_free %d with only %d reserved " 1167 "ino %lu, to_free %d with only %d reserved "
1168 "data blocks\n", inode->i_ino, to_free, 1168 "data blocks", inode->i_ino, to_free,
1169 ei->i_reserved_data_blocks); 1169 ei->i_reserved_data_blocks);
1170 WARN_ON(1); 1170 WARN_ON(1);
1171 to_free = ei->i_reserved_data_blocks; 1171 to_free = ei->i_reserved_data_blocks;
@@ -1428,20 +1428,22 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
1428static void ext4_print_free_blocks(struct inode *inode) 1428static void ext4_print_free_blocks(struct inode *inode)
1429{ 1429{
1430 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1430 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1431 printk(KERN_CRIT "Total free blocks count %lld\n", 1431 struct super_block *sb = inode->i_sb;
1432
1433 ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
1432 EXT4_C2B(EXT4_SB(inode->i_sb), 1434 EXT4_C2B(EXT4_SB(inode->i_sb),
1433 ext4_count_free_clusters(inode->i_sb))); 1435 ext4_count_free_clusters(inode->i_sb)));
1434 printk(KERN_CRIT "Free/Dirty block details\n"); 1436 ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
1435 printk(KERN_CRIT "free_blocks=%lld\n", 1437 ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
1436 (long long) EXT4_C2B(EXT4_SB(inode->i_sb), 1438 (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1437 percpu_counter_sum(&sbi->s_freeclusters_counter))); 1439 percpu_counter_sum(&sbi->s_freeclusters_counter)));
1438 printk(KERN_CRIT "dirty_blocks=%lld\n", 1440 ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
1439 (long long) EXT4_C2B(EXT4_SB(inode->i_sb), 1441 (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1440 percpu_counter_sum(&sbi->s_dirtyclusters_counter))); 1442 percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
1441 printk(KERN_CRIT "Block reservation details\n"); 1443 ext4_msg(sb, KERN_CRIT, "Block reservation details");
1442 printk(KERN_CRIT "i_reserved_data_blocks=%u\n", 1444 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
1443 EXT4_I(inode)->i_reserved_data_blocks); 1445 EXT4_I(inode)->i_reserved_data_blocks);
1444 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", 1446 ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
1445 EXT4_I(inode)->i_reserved_meta_blocks); 1447 EXT4_I(inode)->i_reserved_meta_blocks);
1446 return; 1448 return;
1447} 1449}
@@ -2482,13 +2484,14 @@ static int ext4_da_write_end(struct file *file,
2482 int write_mode = (int)(unsigned long)fsdata; 2484 int write_mode = (int)(unsigned long)fsdata;
2483 2485
2484 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 2486 if (write_mode == FALL_BACK_TO_NONDELALLOC) {
2485 if (ext4_should_order_data(inode)) { 2487 switch (ext4_inode_journal_mode(inode)) {
2488 case EXT4_INODE_ORDERED_DATA_MODE:
2486 return ext4_ordered_write_end(file, mapping, pos, 2489 return ext4_ordered_write_end(file, mapping, pos,
2487 len, copied, page, fsdata); 2490 len, copied, page, fsdata);
2488 } else if (ext4_should_writeback_data(inode)) { 2491 case EXT4_INODE_WRITEBACK_DATA_MODE:
2489 return ext4_writeback_write_end(file, mapping, pos, 2492 return ext4_writeback_write_end(file, mapping, pos,
2490 len, copied, page, fsdata); 2493 len, copied, page, fsdata);
2491 } else { 2494 default:
2492 BUG(); 2495 BUG();
2493 } 2496 }
2494 } 2497 }
@@ -2763,7 +2766,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
2763 goto out; 2766 goto out;
2764 2767
2765 ext_debug("ext4_end_io_dio(): io_end 0x%p " 2768 ext_debug("ext4_end_io_dio(): io_end 0x%p "
2766 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", 2769 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
2767 iocb->private, io_end->inode->i_ino, iocb, offset, 2770 iocb->private, io_end->inode->i_ino, iocb, offset,
2768 size); 2771 size);
2769 2772
@@ -2795,9 +2798,6 @@ out:
2795 2798
2796 /* queue the work to convert unwritten extents to written */ 2799 /* queue the work to convert unwritten extents to written */
2797 queue_work(wq, &io_end->work); 2800 queue_work(wq, &io_end->work);
2798
2799 /* XXX: probably should move into the real I/O completion handler */
2800 inode_dio_done(inode);
2801} 2801}
2802 2802
2803static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) 2803static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
@@ -2811,8 +2811,9 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
2811 goto out; 2811 goto out;
2812 2812
2813 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { 2813 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
2814 printk("sb umounted, discard end_io request for inode %lu\n", 2814 ext4_msg(io_end->inode->i_sb, KERN_INFO,
2815 io_end->inode->i_ino); 2815 "sb umounted, discard end_io request for inode %lu",
2816 io_end->inode->i_ino);
2816 ext4_free_io_end(io_end); 2817 ext4_free_io_end(io_end);
2817 goto out; 2818 goto out;
2818 } 2819 }
@@ -2921,9 +2922,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
2921 iocb->private = NULL; 2922 iocb->private = NULL;
2922 EXT4_I(inode)->cur_aio_dio = NULL; 2923 EXT4_I(inode)->cur_aio_dio = NULL;
2923 if (!is_sync_kiocb(iocb)) { 2924 if (!is_sync_kiocb(iocb)) {
2924 iocb->private = ext4_init_io_end(inode, GFP_NOFS); 2925 ext4_io_end_t *io_end =
2925 if (!iocb->private) 2926 ext4_init_io_end(inode, GFP_NOFS);
2927 if (!io_end)
2926 return -ENOMEM; 2928 return -ENOMEM;
2929 io_end->flag |= EXT4_IO_END_DIRECT;
2930 iocb->private = io_end;
2927 /* 2931 /*
2928 * we save the io structure for current async 2932 * we save the io structure for current async
2929 * direct IO, so that later ext4_map_blocks() 2933 * direct IO, so that later ext4_map_blocks()
@@ -2940,7 +2944,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
2940 ext4_get_block_write, 2944 ext4_get_block_write,
2941 ext4_end_io_dio, 2945 ext4_end_io_dio,
2942 NULL, 2946 NULL,
2943 DIO_LOCKING | DIO_SKIP_HOLES); 2947 DIO_LOCKING);
2944 if (iocb->private) 2948 if (iocb->private)
2945 EXT4_I(inode)->cur_aio_dio = NULL; 2949 EXT4_I(inode)->cur_aio_dio = NULL;
2946 /* 2950 /*
@@ -3086,18 +3090,25 @@ static const struct address_space_operations ext4_da_aops = {
3086 3090
3087void ext4_set_aops(struct inode *inode) 3091void ext4_set_aops(struct inode *inode)
3088{ 3092{
3089 if (ext4_should_order_data(inode) && 3093 switch (ext4_inode_journal_mode(inode)) {
3090 test_opt(inode->i_sb, DELALLOC)) 3094 case EXT4_INODE_ORDERED_DATA_MODE:
3091 inode->i_mapping->a_ops = &ext4_da_aops; 3095 if (test_opt(inode->i_sb, DELALLOC))
3092 else if (ext4_should_order_data(inode)) 3096 inode->i_mapping->a_ops = &ext4_da_aops;
3093 inode->i_mapping->a_ops = &ext4_ordered_aops; 3097 else
3094 else if (ext4_should_writeback_data(inode) && 3098 inode->i_mapping->a_ops = &ext4_ordered_aops;
3095 test_opt(inode->i_sb, DELALLOC)) 3099 break;
3096 inode->i_mapping->a_ops = &ext4_da_aops; 3100 case EXT4_INODE_WRITEBACK_DATA_MODE:
3097 else if (ext4_should_writeback_data(inode)) 3101 if (test_opt(inode->i_sb, DELALLOC))
3098 inode->i_mapping->a_ops = &ext4_writeback_aops; 3102 inode->i_mapping->a_ops = &ext4_da_aops;
3099 else 3103 else
3104 inode->i_mapping->a_ops = &ext4_writeback_aops;
3105 break;
3106 case EXT4_INODE_JOURNAL_DATA_MODE:
3100 inode->i_mapping->a_ops = &ext4_journalled_aops; 3107 inode->i_mapping->a_ops = &ext4_journalled_aops;
3108 break;
3109 default:
3110 BUG();
3111 }
3101} 3112}
3102 3113
3103 3114
@@ -3329,16 +3340,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3329{ 3340{
3330 struct inode *inode = file->f_path.dentry->d_inode; 3341 struct inode *inode = file->f_path.dentry->d_inode;
3331 if (!S_ISREG(inode->i_mode)) 3342 if (!S_ISREG(inode->i_mode))
3332 return -ENOTSUPP; 3343 return -EOPNOTSUPP;
3333 3344
3334 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 3345 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
3335 /* TODO: Add support for non extent hole punching */ 3346 /* TODO: Add support for non extent hole punching */
3336 return -ENOTSUPP; 3347 return -EOPNOTSUPP;
3337 } 3348 }
3338 3349
3339 if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { 3350 if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
3340 /* TODO: Add support for bigalloc file systems */ 3351 /* TODO: Add support for bigalloc file systems */
3341 return -ENOTSUPP; 3352 return -EOPNOTSUPP;
3342 } 3353 }
3343 3354
3344 return ext4_ext_punch_hole(file, offset, length); 3355 return ext4_ext_punch_hole(file, offset, length);
@@ -3924,10 +3935,8 @@ static int ext4_do_update_inode(handle_t *handle,
3924 ext4_update_dynamic_rev(sb); 3935 ext4_update_dynamic_rev(sb);
3925 EXT4_SET_RO_COMPAT_FEATURE(sb, 3936 EXT4_SET_RO_COMPAT_FEATURE(sb,
3926 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 3937 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
3927 sb->s_dirt = 1;
3928 ext4_handle_sync(handle); 3938 ext4_handle_sync(handle);
3929 err = ext4_handle_dirty_metadata(handle, NULL, 3939 err = ext4_handle_dirty_super(handle, sb);
3930 EXT4_SB(sb)->s_sbh);
3931 } 3940 }
3932 } 3941 }
3933 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 3942 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
@@ -4152,11 +4161,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4152 } 4161 }
4153 4162
4154 if (attr->ia_valid & ATTR_SIZE) { 4163 if (attr->ia_valid & ATTR_SIZE) {
4155 if (attr->ia_size != i_size_read(inode)) { 4164 if (attr->ia_size != i_size_read(inode))
4156 truncate_setsize(inode, attr->ia_size); 4165 truncate_setsize(inode, attr->ia_size);
4157 ext4_truncate(inode); 4166 ext4_truncate(inode);
4158 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
4159 ext4_truncate(inode);
4160 } 4167 }
4161 4168
4162 if (!rc) { 4169 if (!rc) {
@@ -4314,7 +4321,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
4314{ 4321{
4315 int err = 0; 4322 int err = 0;
4316 4323
4317 if (test_opt(inode->i_sb, I_VERSION)) 4324 if (IS_I_VERSION(inode))
4318 inode_inc_iversion(inode); 4325 inode_inc_iversion(inode);
4319 4326
4320 /* the do_update_inode consumes one bh->b_count */ 4327 /* the do_update_inode consumes one bh->b_count */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cb990b21c698..99ab428bcfa0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -21,6 +21,7 @@
21 * mballoc.c contains the multiblocks allocation routines 21 * mballoc.c contains the multiblocks allocation routines
22 */ 22 */
23 23
24#include "ext4_jbd2.h"
24#include "mballoc.h" 25#include "mballoc.h"
25#include <linux/debugfs.h> 26#include <linux/debugfs.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
@@ -339,7 +340,7 @@
339 */ 340 */
340static struct kmem_cache *ext4_pspace_cachep; 341static struct kmem_cache *ext4_pspace_cachep;
341static struct kmem_cache *ext4_ac_cachep; 342static struct kmem_cache *ext4_ac_cachep;
342static struct kmem_cache *ext4_free_ext_cachep; 343static struct kmem_cache *ext4_free_data_cachep;
343 344
344/* We create slab caches for groupinfo data structures based on the 345/* We create slab caches for groupinfo data structures based on the
345 * superblock block size. There will be one per mounted filesystem for 346 * superblock block size. There will be one per mounted filesystem for
@@ -357,7 +358,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
357 ext4_group_t group); 358 ext4_group_t group);
358static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 359static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
359 ext4_group_t group); 360 ext4_group_t group);
360static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); 361static void ext4_free_data_callback(struct super_block *sb,
362 struct ext4_journal_cb_entry *jce, int rc);
361 363
362static inline void *mb_correct_addr_and_bit(int *bit, void *addr) 364static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
363{ 365{
@@ -425,7 +427,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
425{ 427{
426 char *bb; 428 char *bb;
427 429
428 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); 430 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
429 BUG_ON(max == NULL); 431 BUG_ON(max == NULL);
430 432
431 if (order > e4b->bd_blkbits + 1) { 433 if (order > e4b->bd_blkbits + 1) {
@@ -436,10 +438,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
436 /* at order 0 we see each particular block */ 438 /* at order 0 we see each particular block */
437 if (order == 0) { 439 if (order == 0) {
438 *max = 1 << (e4b->bd_blkbits + 3); 440 *max = 1 << (e4b->bd_blkbits + 3);
439 return EXT4_MB_BITMAP(e4b); 441 return e4b->bd_bitmap;
440 } 442 }
441 443
442 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 444 bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
443 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 445 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
444 446
445 return bb; 447 return bb;
@@ -588,7 +590,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
588 for (j = 0; j < (1 << order); j++) { 590 for (j = 0; j < (1 << order); j++) {
589 k = (i * (1 << order)) + j; 591 k = (i * (1 << order)) + j;
590 MB_CHECK_ASSERT( 592 MB_CHECK_ASSERT(
591 !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); 593 !mb_test_bit(k, e4b->bd_bitmap));
592 } 594 }
593 count++; 595 count++;
594 } 596 }
@@ -782,7 +784,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
782 int groups_per_page; 784 int groups_per_page;
783 int err = 0; 785 int err = 0;
784 int i; 786 int i;
785 ext4_group_t first_group; 787 ext4_group_t first_group, group;
786 int first_block; 788 int first_block;
787 struct super_block *sb; 789 struct super_block *sb;
788 struct buffer_head *bhs; 790 struct buffer_head *bhs;
@@ -806,24 +808,23 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
806 808
807 /* allocate buffer_heads to read bitmaps */ 809 /* allocate buffer_heads to read bitmaps */
808 if (groups_per_page > 1) { 810 if (groups_per_page > 1) {
809 err = -ENOMEM;
810 i = sizeof(struct buffer_head *) * groups_per_page; 811 i = sizeof(struct buffer_head *) * groups_per_page;
811 bh = kzalloc(i, GFP_NOFS); 812 bh = kzalloc(i, GFP_NOFS);
812 if (bh == NULL) 813 if (bh == NULL) {
814 err = -ENOMEM;
813 goto out; 815 goto out;
816 }
814 } else 817 } else
815 bh = &bhs; 818 bh = &bhs;
816 819
817 first_group = page->index * blocks_per_page / 2; 820 first_group = page->index * blocks_per_page / 2;
818 821
819 /* read all groups the page covers into the cache */ 822 /* read all groups the page covers into the cache */
820 for (i = 0; i < groups_per_page; i++) { 823 for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
821 struct ext4_group_desc *desc; 824 if (group >= ngroups)
822
823 if (first_group + i >= ngroups)
824 break; 825 break;
825 826
826 grinfo = ext4_get_group_info(sb, first_group + i); 827 grinfo = ext4_get_group_info(sb, group);
827 /* 828 /*
828 * If page is uptodate then we came here after online resize 829 * If page is uptodate then we came here after online resize
829 * which added some new uninitialized group info structs, so 830 * which added some new uninitialized group info structs, so
@@ -834,69 +835,21 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
834 bh[i] = NULL; 835 bh[i] = NULL;
835 continue; 836 continue;
836 } 837 }
837 838 if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
838 err = -EIO; 839 err = -ENOMEM;
839 desc = ext4_get_group_desc(sb, first_group + i, NULL);
840 if (desc == NULL)
841 goto out;
842
843 err = -ENOMEM;
844 bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
845 if (bh[i] == NULL)
846 goto out; 840 goto out;
847
848 if (bitmap_uptodate(bh[i]))
849 continue;
850
851 lock_buffer(bh[i]);
852 if (bitmap_uptodate(bh[i])) {
853 unlock_buffer(bh[i]);
854 continue;
855 }
856 ext4_lock_group(sb, first_group + i);
857 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
858 ext4_init_block_bitmap(sb, bh[i],
859 first_group + i, desc);
860 set_bitmap_uptodate(bh[i]);
861 set_buffer_uptodate(bh[i]);
862 ext4_unlock_group(sb, first_group + i);
863 unlock_buffer(bh[i]);
864 continue;
865 } 841 }
866 ext4_unlock_group(sb, first_group + i); 842 mb_debug(1, "read bitmap for group %u\n", group);
867 if (buffer_uptodate(bh[i])) {
868 /*
869 * if not uninit if bh is uptodate,
870 * bitmap is also uptodate
871 */
872 set_bitmap_uptodate(bh[i]);
873 unlock_buffer(bh[i]);
874 continue;
875 }
876 get_bh(bh[i]);
877 /*
878 * submit the buffer_head for read. We can
879 * safely mark the bitmap as uptodate now.
880 * We do it here so the bitmap uptodate bit
881 * get set with buffer lock held.
882 */
883 set_bitmap_uptodate(bh[i]);
884 bh[i]->b_end_io = end_buffer_read_sync;
885 submit_bh(READ, bh[i]);
886 mb_debug(1, "read bitmap for group %u\n", first_group + i);
887 } 843 }
888 844
889 /* wait for I/O completion */ 845 /* wait for I/O completion */
890 for (i = 0; i < groups_per_page; i++) 846 for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
891 if (bh[i]) 847 if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
892 wait_on_buffer(bh[i]); 848 err = -EIO;
893
894 err = -EIO;
895 for (i = 0; i < groups_per_page; i++)
896 if (bh[i] && !buffer_uptodate(bh[i]))
897 goto out; 849 goto out;
850 }
851 }
898 852
899 err = 0;
900 first_block = page->index * blocks_per_page; 853 first_block = page->index * blocks_per_page;
901 for (i = 0; i < blocks_per_page; i++) { 854 for (i = 0; i < blocks_per_page; i++) {
902 int group; 855 int group;
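
The rewritten loop splits the old lock-and-submit body into two passes:
ext4_read_block_bitmap_nowait() fires off every bitmap read the page
needs, and ext4_wait_block_bitmap() then collects them, so all the reads
are in flight at once instead of being handled one group at a time. A
schematic of the two-pass shape (read_nowait()/wait_done() are
hypothetical stand-ins for those two helpers):

#include <stdio.h>

#define GROUPS 4

static int read_nowait(int group) { printf("submit %d\n", group); return 0; }
static int wait_done(int group)   { printf("wait   %d\n", group); return 0; }

int main(void)
{
	int err = 0, i;

	for (i = 0; i < GROUPS; i++)		/* pass 1: fire off all I/O */
		if (read_nowait(i))
			return -1;
	for (i = 0; i < GROUPS; i++)		/* pass 2: collect results  */
		if (wait_done(i)) {
			err = -1;		/* -EIO in the kernel */
			break;
		}
	return err;
}
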
@@ -1250,10 +1203,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1250 int order = 1; 1203 int order = 1;
1251 void *bb; 1204 void *bb;
1252 1205
1253 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); 1206 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
1254 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); 1207 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1255 1208
1256 bb = EXT4_MB_BUDDY(e4b); 1209 bb = e4b->bd_buddy;
1257 while (order <= e4b->bd_blkbits + 1) { 1210 while (order <= e4b->bd_blkbits + 1) {
1258 block = block >> 1; 1211 block = block >> 1;
1259 if (!mb_test_bit(block, bb)) { 1212 if (!mb_test_bit(block, bb)) {
@@ -1323,9 +1276,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1323 1276
1324 /* let's maintain fragments counter */ 1277 /* let's maintain fragments counter */
1325 if (first != 0) 1278 if (first != 0)
1326 block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); 1279 block = !mb_test_bit(first - 1, e4b->bd_bitmap);
1327 if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) 1280 if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
1328 max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); 1281 max = !mb_test_bit(first + count, e4b->bd_bitmap);
1329 if (block && max) 1282 if (block && max)
1330 e4b->bd_info->bb_fragments--; 1283 e4b->bd_info->bb_fragments--;
1331 else if (!block && !max) 1284 else if (!block && !max)
@@ -1336,7 +1289,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1336 block = first++; 1289 block = first++;
1337 order = 0; 1290 order = 0;
1338 1291
1339 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { 1292 if (!mb_test_bit(block, e4b->bd_bitmap)) {
1340 ext4_fsblk_t blocknr; 1293 ext4_fsblk_t blocknr;
1341 1294
1342 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1295 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
@@ -1347,7 +1300,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1347 "freeing already freed block " 1300 "freeing already freed block "
1348 "(bit %u)", block); 1301 "(bit %u)", block);
1349 } 1302 }
1350 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1303 mb_clear_bit(block, e4b->bd_bitmap);
1351 e4b->bd_info->bb_counters[order]++; 1304 e4b->bd_info->bb_counters[order]++;
1352 1305
1353 /* start of the buddy */ 1306 /* start of the buddy */
@@ -1429,7 +1382,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1429 break; 1382 break;
1430 1383
1431 next = (block + 1) * (1 << order); 1384 next = (block + 1) * (1 << order);
1432 if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) 1385 if (mb_test_bit(next, e4b->bd_bitmap))
1433 break; 1386 break;
1434 1387
1435 order = mb_find_order_for_block(e4b, next); 1388 order = mb_find_order_for_block(e4b, next);
@@ -1466,9 +1419,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1466 1419
1467 /* let's maintain fragments counter */ 1420 /* let's maintain fragments counter */
1468 if (start != 0) 1421 if (start != 0)
1469 mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); 1422 mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
1470 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) 1423 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1471 max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); 1424 max = !mb_test_bit(start + len, e4b->bd_bitmap);
1472 if (mlen && max) 1425 if (mlen && max)
1473 e4b->bd_info->bb_fragments++; 1426 e4b->bd_info->bb_fragments++;
1474 else if (!mlen && !max) 1427 else if (!mlen && !max)
@@ -1511,7 +1464,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1511 } 1464 }
1512 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 1465 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1513 1466
1514 ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1467 ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
1515 mb_check_buddy(e4b); 1468 mb_check_buddy(e4b);
1516 1469
1517 return ret; 1470 return ret;
@@ -1810,7 +1763,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1810 struct ext4_buddy *e4b) 1763 struct ext4_buddy *e4b)
1811{ 1764{
1812 struct super_block *sb = ac->ac_sb; 1765 struct super_block *sb = ac->ac_sb;
1813 void *bitmap = EXT4_MB_BITMAP(e4b); 1766 void *bitmap = e4b->bd_bitmap;
1814 struct ext4_free_extent ex; 1767 struct ext4_free_extent ex;
1815 int i; 1768 int i;
1816 int free; 1769 int free;
@@ -1870,7 +1823,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1870{ 1823{
1871 struct super_block *sb = ac->ac_sb; 1824 struct super_block *sb = ac->ac_sb;
1872 struct ext4_sb_info *sbi = EXT4_SB(sb); 1825 struct ext4_sb_info *sbi = EXT4_SB(sb);
1873 void *bitmap = EXT4_MB_BITMAP(e4b); 1826 void *bitmap = e4b->bd_bitmap;
1874 struct ext4_free_extent ex; 1827 struct ext4_free_extent ex;
1875 ext4_fsblk_t first_group_block; 1828 ext4_fsblk_t first_group_block;
1876 ext4_fsblk_t a; 1829 ext4_fsblk_t a;
@@ -2224,7 +2177,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2224 EXT4_DESC_PER_BLOCK_BITS(sb); 2177 EXT4_DESC_PER_BLOCK_BITS(sb);
2225 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2178 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2226 if (meta_group_info == NULL) { 2179 if (meta_group_info == NULL) {
2227 ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " 2180 ext4_msg(sb, KERN_ERR, "can't allocate mem "
2228 "for a buddy group"); 2181 "for a buddy group");
2229 goto exit_meta_group_info; 2182 goto exit_meta_group_info;
2230 } 2183 }
@@ -2238,7 +2191,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2238 2191
2239 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); 2192 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
2240 if (meta_group_info[i] == NULL) { 2193 if (meta_group_info[i] == NULL) {
2241 ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); 2194 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2242 goto exit_group_info; 2195 goto exit_group_info;
2243 } 2196 }
2244 memset(meta_group_info[i], 0, kmem_cache_size(cachep)); 2197 memset(meta_group_info[i], 0, kmem_cache_size(cachep));
@@ -2522,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2522 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2475 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2523 &ext4_mb_seq_groups_fops, sb); 2476 &ext4_mb_seq_groups_fops, sb);
2524 2477
2525 if (sbi->s_journal)
2526 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2527
2528 return 0; 2478 return 0;
2529 2479
2530out_free_locality_groups: 2480out_free_locality_groups:
@@ -2637,58 +2587,55 @@ static inline int ext4_issue_discard(struct super_block *sb,
2637 * This function is called by the jbd2 layer once the commit has finished, 2587 * This function is called by the jbd2 layer once the commit has finished,
2638 * so we know we can free the blocks that were released with that commit. 2588 * so we know we can free the blocks that were released with that commit.
2639 */ 2589 */
2640static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) 2590static void ext4_free_data_callback(struct super_block *sb,
2591 struct ext4_journal_cb_entry *jce,
2592 int rc)
2641{ 2593{
2642 struct super_block *sb = journal->j_private; 2594 struct ext4_free_data *entry = (struct ext4_free_data *)jce;
2643 struct ext4_buddy e4b; 2595 struct ext4_buddy e4b;
2644 struct ext4_group_info *db; 2596 struct ext4_group_info *db;
2645 int err, count = 0, count2 = 0; 2597 int err, count = 0, count2 = 0;
2646 struct ext4_free_data *entry;
2647 struct list_head *l, *ltmp;
2648 2598
2649 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2599 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2650 entry = list_entry(l, struct ext4_free_data, list); 2600 entry->efd_count, entry->efd_group, entry);
2651 2601
2652 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2602 if (test_opt(sb, DISCARD))
2653 entry->count, entry->group, entry); 2603 ext4_issue_discard(sb, entry->efd_group,
2604 entry->efd_start_cluster, entry->efd_count);
2654 2605
2655 if (test_opt(sb, DISCARD)) 2606 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
2656 ext4_issue_discard(sb, entry->group, 2607 /* we expect to find existing buddy because it's pinned */
2657 entry->start_cluster, entry->count); 2608 BUG_ON(err != 0);
2658 2609
2659 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2660 /* we expect to find existing buddy because it's pinned */
2661 BUG_ON(err != 0);
2662 2610
2663 db = e4b.bd_info; 2611 db = e4b.bd_info;
2664 /* there are blocks to put in buddy to make them really free */ 2612 /* there are blocks to put in buddy to make them really free */
2665 count += entry->count; 2613 count += entry->efd_count;
2666 count2++; 2614 count2++;
2667 ext4_lock_group(sb, entry->group); 2615 ext4_lock_group(sb, entry->efd_group);
2668 /* Take it out of per group rb tree */ 2616 /* Take it out of per group rb tree */
2669 rb_erase(&entry->node, &(db->bb_free_root)); 2617 rb_erase(&entry->efd_node, &(db->bb_free_root));
2670 mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count); 2618 mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
2671 2619
2672 /* 2620 /*
2673 * Clear the trimmed flag for the group so that the next 2621 * Clear the trimmed flag for the group so that the next
2674 * ext4_trim_fs can trim it. 2622 * ext4_trim_fs can trim it.
2675 * If the volume is mounted with -o discard, online discard 2623 * If the volume is mounted with -o discard, online discard
2676 * is supported and the free blocks will be trimmed online. 2624 * is supported and the free blocks will be trimmed online.
2677 */ 2625 */
2678 if (!test_opt(sb, DISCARD)) 2626 if (!test_opt(sb, DISCARD))
2679 EXT4_MB_GRP_CLEAR_TRIMMED(db); 2627 EXT4_MB_GRP_CLEAR_TRIMMED(db);
2680 2628
2681 if (!db->bb_free_root.rb_node) { 2629 if (!db->bb_free_root.rb_node) {
2682 /* No more items in the per group rb tree 2630 /* No more items in the per group rb tree
2683 * balance refcounts from ext4_mb_free_metadata() 2631 * balance refcounts from ext4_mb_free_metadata()
2684 */ 2632 */
2685 page_cache_release(e4b.bd_buddy_page); 2633 page_cache_release(e4b.bd_buddy_page);
2686 page_cache_release(e4b.bd_bitmap_page); 2634 page_cache_release(e4b.bd_bitmap_page);
2687 }
2688 ext4_unlock_group(sb, entry->group);
2689 kmem_cache_free(ext4_free_ext_cachep, entry);
2690 ext4_mb_unload_buddy(&e4b);
2691 } 2635 }
2636 ext4_unlock_group(sb, entry->efd_group);
2637 kmem_cache_free(ext4_free_data_cachep, entry);
2638 ext4_mb_unload_buddy(&e4b);
2692 2639
2693 mb_debug(1, "freed %u blocks in %u structures\n", count, count2); 2640 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2694} 2641}
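
For context on what this conversion preserves: extents freed in a running transaction must stay unusable until the commit that records the free reaches disk, and only then go back into the buddy. A minimal user-space model of that deferred-free protocol, assuming a singly linked per-transaction list (struct pending_free, queue_free() and run_commit_callbacks() are invented names for illustration):

#include <stdio.h>
#include <stdlib.h>

struct pending_free;
typedef void (*commit_cb)(void *fs, struct pending_free *pf, int rc);

struct pending_free {
	struct pending_free *next;
	commit_cb cb;
	unsigned group, start, count;
};

/* free path: queue the extent while the transaction is still running */
static void queue_free(struct pending_free **txn_list,
		       struct pending_free *pf, commit_cb cb)
{
	pf->cb = cb;
	pf->next = *txn_list;
	*txn_list = pf;
}

/* commit path: the journaling layer invokes each entry's callback */
static void run_commit_callbacks(void *fs, struct pending_free *list, int rc)
{
	while (list) {
		struct pending_free *next = list->next;	/* cb frees list */

		list->cb(fs, list, rc);
		list = next;
	}
}

static void free_extent_cb(void *fs, struct pending_free *pf, int rc)
{
	printf("returning %u clusters at %u/%u to the buddy (rc=%d)\n",
	       pf->count, pf->group, pf->start, rc);
	free(pf);
	(void)fs;
}

int main(void)
{
	struct pending_free *txn = NULL;
	struct pending_free *pf = malloc(sizeof(*pf));

	*pf = (struct pending_free){ .group = 3, .start = 100, .count = 8 };
	queue_free(&txn, pf, free_extent_cb);
	run_commit_callbacks(NULL, txn, 0);	/* "commit finished" */
	return 0;
}
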
@@ -2741,9 +2688,9 @@ int __init ext4_init_mballoc(void)
2741 return -ENOMEM; 2688 return -ENOMEM;
2742 } 2689 }
2743 2690
2744 ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, 2691 ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
2745 SLAB_RECLAIM_ACCOUNT); 2692 SLAB_RECLAIM_ACCOUNT);
2746 if (ext4_free_ext_cachep == NULL) { 2693 if (ext4_free_data_cachep == NULL) {
2747 kmem_cache_destroy(ext4_pspace_cachep); 2694 kmem_cache_destroy(ext4_pspace_cachep);
2748 kmem_cache_destroy(ext4_ac_cachep); 2695 kmem_cache_destroy(ext4_ac_cachep);
2749 return -ENOMEM; 2696 return -ENOMEM;
@@ -2761,7 +2708,7 @@ void ext4_exit_mballoc(void)
2761 rcu_barrier(); 2708 rcu_barrier();
2762 kmem_cache_destroy(ext4_pspace_cachep); 2709 kmem_cache_destroy(ext4_pspace_cachep);
2763 kmem_cache_destroy(ext4_ac_cachep); 2710 kmem_cache_destroy(ext4_ac_cachep);
2764 kmem_cache_destroy(ext4_free_ext_cachep); 2711 kmem_cache_destroy(ext4_free_data_cachep);
2765 ext4_groupinfo_destroy_slabs(); 2712 ext4_groupinfo_destroy_slabs();
2766 ext4_remove_debugfs_entry(); 2713 ext4_remove_debugfs_entry();
2767} 2714}
@@ -2815,7 +2762,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2815 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 2762 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
2816 if (!ext4_data_block_valid(sbi, block, len)) { 2763 if (!ext4_data_block_valid(sbi, block, len)) {
2817 ext4_error(sb, "Allocating blocks %llu-%llu which overlap " 2764 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2818 "fs metadata\n", block, block+len); 2765 "fs metadata", block, block+len);
2819 /* File system mounted not to panic on error 2766 /* File system mounted not to panic on error
2820 * Fix the bitmap and repeat the block allocation 2767 * Fix the bitmap and repeat the block allocation
2821 * We leak some of the blocks here. 2768 * We leak some of the blocks here.
@@ -2911,7 +2858,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2911 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2858 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2912 int bsbits, max; 2859 int bsbits, max;
2913 ext4_lblk_t end; 2860 ext4_lblk_t end;
2914 loff_t size, orig_size, start_off; 2861 loff_t size, start_off;
2862 loff_t orig_size __maybe_unused;
2915 ext4_lblk_t start; 2863 ext4_lblk_t start;
2916 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 2864 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
2917 struct ext4_prealloc_space *pa; 2865 struct ext4_prealloc_space *pa;
@@ -3321,8 +3269,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3321 n = rb_first(&(grp->bb_free_root)); 3269 n = rb_first(&(grp->bb_free_root));
3322 3270
3323 while (n) { 3271 while (n) {
3324 entry = rb_entry(n, struct ext4_free_data, node); 3272 entry = rb_entry(n, struct ext4_free_data, efd_node);
3325 ext4_set_bits(bitmap, entry->start_cluster, entry->count); 3273 ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
3326 n = rb_next(n); 3274 n = rb_next(n);
3327 } 3275 }
3328 return; 3276 return;
@@ -3916,11 +3864,11 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3916 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) 3864 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
3917 return; 3865 return;
3918 3866
3919 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" 3867 ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
3920 " Allocation context details:"); 3868 " Allocation context details:");
3921 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", 3869 ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
3922 ac->ac_status, ac->ac_flags); 3870 ac->ac_status, ac->ac_flags);
3923 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " 3871 ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
3924 "goal %lu/%lu/%lu@%lu, " 3872 "goal %lu/%lu/%lu@%lu, "
3925 "best %lu/%lu/%lu@%lu cr %d", 3873 "best %lu/%lu/%lu@%lu cr %d",
3926 (unsigned long)ac->ac_o_ex.fe_group, 3874 (unsigned long)ac->ac_o_ex.fe_group,
@@ -3936,9 +3884,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3936 (unsigned long)ac->ac_b_ex.fe_len, 3884 (unsigned long)ac->ac_b_ex.fe_len,
3937 (unsigned long)ac->ac_b_ex.fe_logical, 3885 (unsigned long)ac->ac_b_ex.fe_logical,
3938 (int)ac->ac_criteria); 3886 (int)ac->ac_criteria);
3939 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found", 3887 ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found",
3940 ac->ac_ex_scanned, ac->ac_found); 3888 ac->ac_ex_scanned, ac->ac_found);
3941 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); 3889 ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
3942 ngroups = ext4_get_groups_count(sb); 3890 ngroups = ext4_get_groups_count(sb);
3943 for (i = 0; i < ngroups; i++) { 3891 for (i = 0; i < ngroups; i++) {
3944 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 3892 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
@@ -4428,9 +4376,9 @@ out:
4428static int can_merge(struct ext4_free_data *entry1, 4376static int can_merge(struct ext4_free_data *entry1,
4429 struct ext4_free_data *entry2) 4377 struct ext4_free_data *entry2)
4430{ 4378{
4431 if ((entry1->t_tid == entry2->t_tid) && 4379 if ((entry1->efd_tid == entry2->efd_tid) &&
4432 (entry1->group == entry2->group) && 4380 (entry1->efd_group == entry2->efd_group) &&
4433 ((entry1->start_cluster + entry1->count) == entry2->start_cluster)) 4381 ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
4434 return 1; 4382 return 1;
4435 return 0; 4383 return 0;
4436} 4384}
@@ -4452,8 +4400,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4452 BUG_ON(e4b->bd_bitmap_page == NULL); 4400 BUG_ON(e4b->bd_bitmap_page == NULL);
4453 BUG_ON(e4b->bd_buddy_page == NULL); 4401 BUG_ON(e4b->bd_buddy_page == NULL);
4454 4402
4455 new_node = &new_entry->node; 4403 new_node = &new_entry->efd_node;
4456 cluster = new_entry->start_cluster; 4404 cluster = new_entry->efd_start_cluster;
4457 4405
4458 if (!*n) { 4406 if (!*n) {
4459 /* first free block extent. We need to 4407
@@ -4466,10 +4414,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4466 } 4414 }
4467 while (*n) { 4415 while (*n) {
4468 parent = *n; 4416 parent = *n;
4469 entry = rb_entry(parent, struct ext4_free_data, node); 4417 entry = rb_entry(parent, struct ext4_free_data, efd_node);
4470 if (cluster < entry->start_cluster) 4418 if (cluster < entry->efd_start_cluster)
4471 n = &(*n)->rb_left; 4419 n = &(*n)->rb_left;
4472 else if (cluster >= (entry->start_cluster + entry->count)) 4420 else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
4473 n = &(*n)->rb_right; 4421 n = &(*n)->rb_right;
4474 else { 4422 else {
4475 ext4_grp_locked_error(sb, group, 0, 4423 ext4_grp_locked_error(sb, group, 0,
@@ -4486,34 +4434,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4486 /* Now try to see the extent can be merged to left and right */ 4434 /* Now try to see the extent can be merged to left and right */
4487 node = rb_prev(new_node); 4435 node = rb_prev(new_node);
4488 if (node) { 4436 if (node) {
4489 entry = rb_entry(node, struct ext4_free_data, node); 4437 entry = rb_entry(node, struct ext4_free_data, efd_node);
4490 if (can_merge(entry, new_entry)) { 4438 if (can_merge(entry, new_entry)) {
4491 new_entry->start_cluster = entry->start_cluster; 4439 new_entry->efd_start_cluster = entry->efd_start_cluster;
4492 new_entry->count += entry->count; 4440 new_entry->efd_count += entry->efd_count;
4493 rb_erase(node, &(db->bb_free_root)); 4441 rb_erase(node, &(db->bb_free_root));
4494 spin_lock(&sbi->s_md_lock); 4442 ext4_journal_callback_del(handle, &entry->efd_jce);
4495 list_del(&entry->list); 4443 kmem_cache_free(ext4_free_data_cachep, entry);
4496 spin_unlock(&sbi->s_md_lock);
4497 kmem_cache_free(ext4_free_ext_cachep, entry);
4498 } 4444 }
4499 } 4445 }
4500 4446
4501 node = rb_next(new_node); 4447 node = rb_next(new_node);
4502 if (node) { 4448 if (node) {
4503 entry = rb_entry(node, struct ext4_free_data, node); 4449 entry = rb_entry(node, struct ext4_free_data, efd_node);
4504 if (can_merge(new_entry, entry)) { 4450 if (can_merge(new_entry, entry)) {
4505 new_entry->count += entry->count; 4451 new_entry->efd_count += entry->efd_count;
4506 rb_erase(node, &(db->bb_free_root)); 4452 rb_erase(node, &(db->bb_free_root));
4507 spin_lock(&sbi->s_md_lock); 4453 ext4_journal_callback_del(handle, &entry->efd_jce);
4508 list_del(&entry->list); 4454 kmem_cache_free(ext4_free_data_cachep, entry);
4509 spin_unlock(&sbi->s_md_lock);
4510 kmem_cache_free(ext4_free_ext_cachep, entry);
4511 } 4455 }
4512 } 4456 }
4513 /* Add the extent to transaction's private list */ 4457 /* Add the extent to transaction's private list */
4514 spin_lock(&sbi->s_md_lock); 4458 ext4_journal_callback_add(handle, ext4_free_data_callback,
4515 list_add(&new_entry->list, &handle->h_transaction->t_private_list); 4459 &new_entry->efd_jce);
4516 spin_unlock(&sbi->s_md_lock);
4517 return 0; 4460 return 0;
4518} 4461}
4519 4462
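
The insertion-plus-merge logic above keeps the per-group tree holding maximal extents: a new extent is folded into its predecessor or absorbs its successor when they touch and share a transaction. A simplified sketch of the same merge rule, using a sorted singly linked list in place of the rb-tree purely to keep the example short (mergeable() mirrors can_merge()):

#include <stdio.h>
#include <stdlib.h>

struct extent {
	struct extent *next;		/* list kept sorted by start */
	unsigned tid, start, count;
};

/* mirror of can_merge(): adjacent, and freed in the same transaction */
static int mergeable(const struct extent *a, const struct extent *b)
{
	return a->tid == b->tid && a->start + a->count == b->start;
}

static void insert_extent(struct extent **head, struct extent *e)
{
	struct extent *prev = NULL, *cur = *head;

	while (cur && cur->start < e->start) {
		prev = cur;
		cur = cur->next;
	}
	e->next = cur;
	if (prev)
		prev->next = e;
	else
		*head = e;

	if (cur && mergeable(e, cur)) {		/* absorb the successor */
		e->count += cur->count;
		e->next = cur->next;
		free(cur);
	}
	if (prev && mergeable(prev, e)) {	/* fold into predecessor */
		prev->count += e->count;
		prev->next = e->next;
		free(e);
	}
}

static struct extent *mk(unsigned tid, unsigned start, unsigned count)
{
	struct extent *e = calloc(1, sizeof(*e));

	e->tid = tid;
	e->start = start;
	e->count = count;
	return e;
}

int main(void)
{
	struct extent *head = NULL;

	insert_extent(&head, mk(1, 10, 5));
	insert_extent(&head, mk(1, 15, 5));	/* merges into [10, 20) */
	for (struct extent *e = head; e; e = e->next)
		printf("[%u, +%u) tid=%u\n", e->start, e->count, e->tid);
	return 0;
}
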
@@ -4691,15 +4634,15 @@ do_more:
4691 * blocks being freed are metadata. these blocks shouldn't 4634 * blocks being freed are metadata. these blocks shouldn't
4692 * be used until this transaction is committed 4635 * be used until this transaction is committed
4693 */ 4636 */
4694 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); 4637 new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
4695 if (!new_entry) { 4638 if (!new_entry) {
4696 err = -ENOMEM; 4639 err = -ENOMEM;
4697 goto error_return; 4640 goto error_return;
4698 } 4641 }
4699 new_entry->start_cluster = bit; 4642 new_entry->efd_start_cluster = bit;
4700 new_entry->group = block_group; 4643 new_entry->efd_group = block_group;
4701 new_entry->count = count_clusters; 4644 new_entry->efd_count = count_clusters;
4702 new_entry->t_tid = handle->h_transaction->t_tid; 4645 new_entry->efd_tid = handle->h_transaction->t_tid;
4703 4646
4704 ext4_lock_group(sb, block_group); 4647 ext4_lock_group(sb, block_group);
4705 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 4648 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
@@ -4971,11 +4914,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4971 start = (e4b.bd_info->bb_first_free > start) ? 4914 start = (e4b.bd_info->bb_first_free > start) ?
4972 e4b.bd_info->bb_first_free : start; 4915 e4b.bd_info->bb_first_free : start;
4973 4916
4974 while (start < max) { 4917 while (start <= max) {
4975 start = mb_find_next_zero_bit(bitmap, max, start); 4918 start = mb_find_next_zero_bit(bitmap, max + 1, start);
4976 if (start >= max) 4919 if (start > max)
4977 break; 4920 break;
4978 next = mb_find_next_bit(bitmap, max, start); 4921 next = mb_find_next_bit(bitmap, max + 1, start);
4979 4922
4980 if ((next - start) >= minblocks) { 4923 if ((next - start) >= minblocks) {
4981 ext4_trim_extent(sb, start, 4924 ext4_trim_extent(sb, start,
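
The loop rewrite makes max an inclusive bound: the bit-search helpers take an exclusive size argument, hence the "max + 1", and the exit test becomes "start > max" so the final cluster is no longer skipped. A self-contained sketch with simplified stand-ins for mb_find_next_zero_bit()/mb_find_next_bit():

#include <stdio.h>

/* stand-ins for the mb_find_next_* helpers; like the real ones, the
 * 'size' argument is exclusive */
static unsigned find_next_zero_bit(const unsigned char *bm, unsigned size,
				   unsigned off)
{
	while (off < size && (bm[off >> 3] & (1u << (off & 7))))
		off++;
	return off;
}

static unsigned find_next_bit(const unsigned char *bm, unsigned size,
			      unsigned off)
{
	while (off < size && !(bm[off >> 3] & (1u << (off & 7))))
		off++;
	return off;
}

static void scan_free_runs(const unsigned char *bm, unsigned start,
			   unsigned max, unsigned minlen)
{
	while (start <= max) {			/* max is now inclusive */
		unsigned next;

		start = find_next_zero_bit(bm, max + 1, start);
		if (start > max)
			break;
		next = find_next_bit(bm, max + 1, start);
		if (next - start >= minlen)
			printf("trim [%u, %u)\n", start, next);
		start = next + 1;
	}
}

int main(void)
{
	unsigned char bm[2] = { 0x0f, 0xf0 };	/* bits 4..11 are free */

	scan_free_runs(bm, 0, 15, 2);		/* prints "trim [4, 12)" */
	return 0;
}
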
@@ -5027,37 +4970,36 @@ out:
5027int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) 4970int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5028{ 4971{
5029 struct ext4_group_info *grp; 4972 struct ext4_group_info *grp;
5030 ext4_group_t first_group, last_group; 4973 ext4_group_t group, first_group, last_group;
5031 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
5032 ext4_grpblk_t cnt = 0, first_cluster, last_cluster; 4974 ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
5033 uint64_t start, len, minlen, trimmed = 0; 4975 uint64_t start, end, minlen, trimmed = 0;
5034 ext4_fsblk_t first_data_blk = 4976 ext4_fsblk_t first_data_blk =
5035 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 4977 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4978 ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
5036 int ret = 0; 4979 int ret = 0;
5037 4980
5038 start = range->start >> sb->s_blocksize_bits; 4981 start = range->start >> sb->s_blocksize_bits;
5039 len = range->len >> sb->s_blocksize_bits; 4982 end = start + (range->len >> sb->s_blocksize_bits) - 1;
5040 minlen = range->minlen >> sb->s_blocksize_bits; 4983 minlen = range->minlen >> sb->s_blocksize_bits;
5041 4984
5042 if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb))) 4985 if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) ||
4986 unlikely(start >= max_blks))
5043 return -EINVAL; 4987 return -EINVAL;
5044 if (start + len <= first_data_blk) 4988 if (end >= max_blks)
4989 end = max_blks - 1;
4990 if (end <= first_data_blk)
5045 goto out; 4991 goto out;
5046 if (start < first_data_blk) { 4992 if (start < first_data_blk)
5047 len -= first_data_blk - start;
5048 start = first_data_blk; 4993 start = first_data_blk;
5049 }
5050 4994
5051 /* Determine first and last group to examine based on start and len */ 4995 /* Determine first and last group to examine based on start and end */
5052 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, 4996 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
5053 &first_group, &first_cluster); 4997 &first_group, &first_cluster);
5054 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), 4998 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
5055 &last_group, &last_cluster); 4999 &last_group, &last_cluster);
5056 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
5057 last_cluster = EXT4_CLUSTERS_PER_GROUP(sb);
5058 5000
5059 if (first_group > last_group) 5001 /* end now represents the last cluster to discard in this group */
5060 return -EINVAL; 5002 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
5061 5003
5062 for (group = first_group; group <= last_group; group++) { 5004 for (group = first_group; group <= last_group; group++) {
5063 grp = ext4_get_group_info(sb, group); 5005 grp = ext4_get_group_info(sb, group);
@@ -5069,31 +5011,35 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5069 } 5011 }
5070 5012
5071 /* 5013 /*
5072 * For all the groups except the last one, last block will 5014 * For all the groups except the last one, last cluster will
5073 * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to 5015 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
5074 * change it for the last group in which case start + 5016 * change it for the last group, note that last_cluster is
5075 * len < EXT4_BLOCKS_PER_GROUP(sb). 5017 * already computed earlier by ext4_get_group_no_and_offset()
5076 */ 5018 */
5077 if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb)) 5019 if (group == last_group)
5078 last_cluster = first_cluster + len; 5020 end = last_cluster;
5079 len -= last_cluster - first_cluster;
5080 5021
5081 if (grp->bb_free >= minlen) { 5022 if (grp->bb_free >= minlen) {
5082 cnt = ext4_trim_all_free(sb, group, first_cluster, 5023 cnt = ext4_trim_all_free(sb, group, first_cluster,
5083 last_cluster, minlen); 5024 end, minlen);
5084 if (cnt < 0) { 5025 if (cnt < 0) {
5085 ret = cnt; 5026 ret = cnt;
5086 break; 5027 break;
5087 } 5028 }
5029 trimmed += cnt;
5088 } 5030 }
5089 trimmed += cnt; 5031
5032 /*
5033 * For every group except the first one, we are sure
5034 * that the first cluster to discard will be cluster #0.
5035 */
5090 first_cluster = 0; 5036 first_cluster = 0;
5091 } 5037 }
5092 range->len = trimmed * sb->s_blocksize;
5093 5038
5094 if (!ret) 5039 if (!ret)
5095 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); 5040 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
5096 5041
5097out: 5042out:
5043 range->len = trimmed * sb->s_blocksize;
5098 return ret; 5044 return ret;
5099} 5045}
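
The rewritten ext4_trim_fs() carves the inclusive [start, end] block range into per-group cluster ranges: only the first group may start past cluster 0, and only the last group may stop before the end of the group. A worked example with hypothetical sizes, assuming 32768 clusters per group and no flex_bg remapping (the real code derives group/offset via ext4_get_group_no_and_offset()):

#include <stdio.h>

#define CLUSTERS_PER_GROUP 32768ULL	/* assumed group size */

int main(void)
{
	unsigned long long start = 40000, end = 130000;	/* inclusive */
	unsigned long long first_group = start / CLUSTERS_PER_GROUP;
	unsigned long long last_group  = end / CLUSTERS_PER_GROUP;
	unsigned long long first_cluster = start % CLUSTERS_PER_GROUP;
	unsigned long long last_cluster  = end % CLUSTERS_PER_GROUP;

	for (unsigned long long g = first_group; g <= last_group; g++) {
		unsigned long long lo = (g == first_group) ? first_cluster : 0;
		unsigned long long hi = (g == last_group) ? last_cluster
						: CLUSTERS_PER_GROUP - 1;

		printf("group %llu: trim clusters %llu..%llu\n", g, lo, hi);
	}
	return 0;
}
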
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 47705f3285e3..c070618c21ce 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -96,21 +96,23 @@ extern u8 mb_enable_debug;
96 96
97 97
98struct ext4_free_data { 98struct ext4_free_data {
99 /* this links the free block information from group_info */ 99 /* MUST be the first member */
100 struct rb_node node; 100 struct ext4_journal_cb_entry efd_jce;
101
102 /* ext4_free_data private data starts from here */
101 103
102 /* this links the free block information from ext4_sb_info */ 104 /* this links the free block information from group_info */
103 struct list_head list; 105 struct rb_node efd_node;
104 106
105 /* group which free block extent belongs */ 107 /* group which free block extent belongs */
106 ext4_group_t group; 108 ext4_group_t efd_group;
107 109
108 /* free block extent */ 110 /* free block extent */
109 ext4_grpblk_t start_cluster; 111 ext4_grpblk_t efd_start_cluster;
110 ext4_grpblk_t count; 112 ext4_grpblk_t efd_count;
111 113
112 /* transaction which freed this extent */ 114 /* transaction which freed this extent */
113 tid_t t_tid; 115 tid_t efd_tid;
114}; 116};
115 117
116struct ext4_prealloc_space { 118struct ext4_prealloc_space {
@@ -210,8 +212,6 @@ struct ext4_buddy {
210 __u16 bd_blkbits; 212 __u16 bd_blkbits;
211 ext4_group_t bd_group; 213 ext4_group_t bd_group;
212}; 214};
213#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
214#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
215 215
216static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 216static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
217 struct ext4_free_extent *fex) 217 struct ext4_free_extent *fex)
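
The "MUST be the first member" rule on efd_jce above exists because ext4_free_data_callback() recovers the outer structure from the embedded callback entry with a plain cast, which is only equivalent to container_of() when the member sits at offset zero. A standalone model of that contract (all names here are illustrative):

#include <assert.h>
#include <stddef.h>

struct cb_entry {
	void (*func)(struct cb_entry *jce);
};

struct free_data {
	struct cb_entry jce;	/* must stay first for the cast below */
	unsigned group, start, count;
};

static void callback(struct cb_entry *jce)
{
	/* safe only because jce is the first member of free_data */
	struct free_data *fd = (struct free_data *)jce;

	(void)fd;
}

int main(void)
{
	struct free_data fd = { .jce = { callback }, .group = 7 };

	assert(offsetof(struct free_data, jce) == 0);
	assert((void *)&fd == (void *)&fd.jce);
	fd.jce.func(&fd.jce);
	return 0;
}
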
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index e7d6bb0acfa6..f39f80f8f2c5 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -471,7 +471,7 @@ int ext4_ext_migrate(struct inode *inode)
471 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, 471 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
472 S_IFREG, NULL, goal, owner); 472 S_IFREG, NULL, goal, owner);
473 if (IS_ERR(tmp_inode)) { 473 if (IS_ERR(tmp_inode)) {
474 retval = PTR_ERR(inode); 474 retval = PTR_ERR(tmp_inode);
475 ext4_journal_stop(handle); 475 ext4_journal_stop(handle);
476 return retval; 476 return retval;
477 } 477 }
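
The one-line fix above is the classic ERR_PTR slip: the error code has to be decoded from the pointer that actually carries it, tmp_inode, not from the still-valid inode. A user-space model of the ERR_PTR/IS_ERR/PTR_ERR convention (make_inode() is a stand-in; -12 plays the role of -ENOMEM):

#include <stdio.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(long err)	{ return (void *)err; }
static long PTR_ERR(const void *p) { return (long)p; }
static int IS_ERR(const void *p)
{
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static void *make_inode(int fail)
{
	static int dummy;

	return fail ? ERR_PTR(-12) : (void *)&dummy;
}

int main(void)
{
	void *tmp_inode = make_inode(1);

	if (IS_ERR(tmp_inode))
		/* decode tmp_inode itself, not some other live pointer */
		printf("error %ld\n", PTR_ERR(tmp_inode));
	return 0;
}
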
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 7ea4ba4eff2a..ed6548d89165 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -257,8 +257,8 @@ int ext4_multi_mount_protect(struct super_block *sb,
257 * If check_interval in MMP block is larger, use that instead of 257 * If check_interval in MMP block is larger, use that instead of
258 * update_interval from the superblock. 258 * update_interval from the superblock.
259 */ 259 */
260 if (mmp->mmp_check_interval > mmp_check_interval) 260 if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
261 mmp_check_interval = mmp->mmp_check_interval; 261 mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
262 262
263 seq = le32_to_cpu(mmp->mmp_seq); 263 seq = le32_to_cpu(mmp->mmp_seq);
264 if (seq == EXT4_MMP_SEQ_CLEAN) 264 if (seq == EXT4_MMP_SEQ_CLEAN)
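
mmp_check_interval is stored on disk in little-endian byte order, so using it without le16_to_cpu() is only accidentally correct on little-endian hosts; the hunk above adds the missing conversion. A host-side model of what the conversion does:

#include <stdint.h>
#include <stdio.h>

static uint16_t le16_to_cpu_model(const uint8_t b[2])
{
	return (uint16_t)(b[0] | (b[1] << 8));	/* byte 0 least significant */
}

int main(void)
{
	uint8_t on_disk[2] = { 0x2c, 0x01 };	/* 300, little-endian */

	/* reading the raw bytes as a native u16 matches 300 only on LE
	 * machines; the explicit conversion is correct everywhere */
	printf("%u\n", le16_to_cpu_model(on_disk));
	return 0;
}
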
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2043f482375d..349d7b3671c8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -468,7 +468,7 @@ fail2:
468fail: 468fail:
469 if (*err == ERR_BAD_DX_DIR) 469 if (*err == ERR_BAD_DX_DIR)
470 ext4_warning(dir->i_sb, 470 ext4_warning(dir->i_sb,
471 "Corrupt dir inode %ld, running e2fsck is " 471 "Corrupt dir inode %lu, running e2fsck is "
472 "recommended.", dir->i_ino); 472 "recommended.", dir->i_ino);
473 return NULL; 473 return NULL;
474} 474}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 475851896518..74cd1f7f1f88 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -60,7 +60,6 @@ void ext4_ioend_wait(struct inode *inode)
60static void put_io_page(struct ext4_io_page *io_page) 60static void put_io_page(struct ext4_io_page *io_page)
61{ 61{
62 if (atomic_dec_and_test(&io_page->p_count)) { 62 if (atomic_dec_and_test(&io_page->p_count)) {
63 end_page_writeback(io_page->p_page);
64 put_page(io_page->p_page); 63 put_page(io_page->p_page);
65 kmem_cache_free(io_page_cachep, io_page); 64 kmem_cache_free(io_page_cachep, io_page);
66 } 65 }
@@ -110,6 +109,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
110 if (io->iocb) 109 if (io->iocb)
111 aio_complete(io->iocb, io->result, 0); 110 aio_complete(io->iocb, io->result, 0);
112 111
112 if (io->flag & EXT4_IO_END_DIRECT)
113 inode_dio_done(inode);
113 /* Wake up anyone waiting on unwritten extent conversion */ 114 /* Wake up anyone waiting on unwritten extent conversion */
114 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) 115 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
115 wake_up_all(ext4_ioend_wq(io->inode)); 116 wake_up_all(ext4_ioend_wq(io->inode));
@@ -127,12 +128,18 @@ static void ext4_end_io_work(struct work_struct *work)
127 unsigned long flags; 128 unsigned long flags;
128 129
129 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 130 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
131 if (io->flag & EXT4_IO_END_IN_FSYNC)
132 goto requeue;
130 if (list_empty(&io->list)) { 133 if (list_empty(&io->list)) {
131 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 134 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
132 goto free; 135 goto free;
133 } 136 }
134 137
135 if (!mutex_trylock(&inode->i_mutex)) { 138 if (!mutex_trylock(&inode->i_mutex)) {
139 bool was_queued;
140requeue:
141 was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
142 io->flag |= EXT4_IO_END_QUEUED;
136 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 143 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
137 /* 144 /*
138 * Requeue the work instead of waiting so that the work 145 * Requeue the work instead of waiting so that the work
@@ -145,9 +152,8 @@ static void ext4_end_io_work(struct work_struct *work)
145 * yield the cpu if it sees an end_io request that has already 152 * yield the cpu if it sees an end_io request that has already
146 * been requeued. 153 * been requeued.
147 */ 154 */
148 if (io->flag & EXT4_IO_END_QUEUED) 155 if (was_queued)
149 yield(); 156 yield();
150 io->flag |= EXT4_IO_END_QUEUED;
151 return; 157 return;
152 } 158 }
153 list_del_init(&io->list); 159 list_del_init(&io->list);
@@ -227,9 +233,9 @@ static void ext4_end_bio(struct bio *bio, int error)
227 } while (bh != head); 233 } while (bh != head);
228 } 234 }
229 235
230 put_io_page(io_end->pages[i]); 236 if (atomic_read(&io_end->pages[i]->p_count) == 1)
237 end_page_writeback(io_end->pages[i]->p_page);
231 } 238 }
232 io_end->num_io_pages = 0;
233 inode = io_end->inode; 239 inode = io_end->inode;
234 240
235 if (error) { 241 if (error) {
@@ -421,6 +427,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
421 * PageWriteback bit from the page to prevent the system from 427 * PageWriteback bit from the page to prevent the system from
422 * wedging later on. 428 * wedging later on.
423 */ 429 */
430 if (atomic_read(&io_page->p_count) == 1)
431 end_page_writeback(page);
424 put_io_page(io_page); 432 put_io_page(io_page);
425 return ret; 433 return ret;
426} 434}
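
The thread through these page-io hunks: end_page_writeback() no longer fires unconditionally in put_io_page(); instead each completion path ends writeback only when it observes itself holding the last reference (p_count == 1) before dropping it. A small model of that rule using C11 atomics (the printfs stand in for the real completion work):

#include <stdatomic.h>
#include <stdio.h>

struct io_page {
	atomic_int p_count;
};

static void end_writeback(struct io_page *p)
{
	(void)p;
	printf("writeback done\n");
}

static void put_io_page(struct io_page *p)
{
	if (atomic_fetch_sub(&p->p_count, 1) == 1)
		printf("freed\n");	/* last reference gone */
}

static void io_complete(struct io_page *p)
{
	if (atomic_load(&p->p_count) == 1)	/* we hold the last ref */
		end_writeback(p);
	put_io_page(p);
}

int main(void)
{
	struct io_page p;

	atomic_init(&p.p_count, 2);
	io_complete(&p);	/* not last ref: writeback stays pending */
	io_complete(&p);	/* last ref: end writeback, then free */
	return 0;
}
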
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index f9d948f0eb86..59fa0be27251 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1163,8 +1163,11 @@ static void ext4_update_super(struct super_block *sb,
1163 do_div(reserved_blocks, 100); 1163 do_div(reserved_blocks, 100);
1164 1164
1165 ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); 1165 ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
1166 ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks);
1166 le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * 1167 le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
1167 flex_gd->count); 1168 flex_gd->count);
1169 le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) *
1170 flex_gd->count);
1168 1171
1169 /* 1172 /*
1170 * We need to protect s_groups_count against other CPUs seeing 1173 * We need to protect s_groups_count against other CPUs seeing
@@ -1465,6 +1468,7 @@ static int ext4_group_extend_no_check(struct super_block *sb,
1465 } 1468 }
1466 1469
1467 ext4_blocks_count_set(es, o_blocks_count + add); 1470 ext4_blocks_count_set(es, o_blocks_count + add);
1471 ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add);
1468 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, 1472 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1469 o_blocks_count + add); 1473 o_blocks_count + add);
1470 /* We add the blocks to the bitmap and set the group need init bit */ 1474 /* We add the blocks to the bitmap and set the group need init bit */
@@ -1512,16 +1516,17 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1512 o_blocks_count = ext4_blocks_count(es); 1516 o_blocks_count = ext4_blocks_count(es);
1513 1517
1514 if (test_opt(sb, DEBUG)) 1518 if (test_opt(sb, DEBUG))
1515 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", 1519 ext4_msg(sb, KERN_DEBUG,
1516 o_blocks_count, n_blocks_count); 1520 "extending last group from %llu to %llu blocks",
1521 o_blocks_count, n_blocks_count);
1517 1522
1518 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) 1523 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
1519 return 0; 1524 return 0;
1520 1525
1521 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 1526 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
1522 printk(KERN_ERR "EXT4-fs: filesystem on %s:" 1527 ext4_msg(sb, KERN_ERR,
1523 " too large to resize to %llu blocks safely\n", 1528 "filesystem too large to resize to %llu blocks safely",
1524 sb->s_id, n_blocks_count); 1529 n_blocks_count);
1525 if (sizeof(sector_t) < 8) 1530 if (sizeof(sector_t) < 8)
1526 ext4_warning(sb, "CONFIG_LBDAF not enabled"); 1531 ext4_warning(sb, "CONFIG_LBDAF not enabled");
1527 return -EINVAL; 1532 return -EINVAL;
@@ -1582,7 +1587,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1582 ext4_fsblk_t o_blocks_count; 1587 ext4_fsblk_t o_blocks_count;
1583 ext4_group_t o_group; 1588 ext4_group_t o_group;
1584 ext4_group_t n_group; 1589 ext4_group_t n_group;
1585 ext4_grpblk_t offset; 1590 ext4_grpblk_t offset, add;
1586 unsigned long n_desc_blocks; 1591 unsigned long n_desc_blocks;
1587 unsigned long o_desc_blocks; 1592 unsigned long o_desc_blocks;
1588 unsigned long desc_blocks; 1593 unsigned long desc_blocks;
@@ -1591,8 +1596,8 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1591 o_blocks_count = ext4_blocks_count(es); 1596 o_blocks_count = ext4_blocks_count(es);
1592 1597
1593 if (test_opt(sb, DEBUG)) 1598 if (test_opt(sb, DEBUG))
1594 printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu " 1599 ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu "
1595 "upto %llu blocks\n", o_blocks_count, n_blocks_count); 1600 "to %llu blocks", o_blocks_count, n_blocks_count);
1596 1601
1597 if (n_blocks_count < o_blocks_count) { 1602 if (n_blocks_count < o_blocks_count) {
1598 /* On-line shrinking not supported */ 1603 /* On-line shrinking not supported */
@@ -1605,7 +1610,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1605 return 0; 1610 return 0;
1606 1611
1607 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); 1612 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
1608 ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset); 1613 ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
1609 1614
1610 n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / 1615 n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /
1611 EXT4_DESC_PER_BLOCK(sb); 1616 EXT4_DESC_PER_BLOCK(sb);
@@ -1634,10 +1639,12 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1634 } 1639 }
1635 brelse(bh); 1640 brelse(bh);
1636 1641
1637 if (offset != 0) { 1642 /* extend the last group */
1638 /* extend the last group */ 1643 if (n_group == o_group)
1639 ext4_grpblk_t add; 1644 add = n_blocks_count - o_blocks_count;
1640 add = EXT4_BLOCKS_PER_GROUP(sb) - offset; 1645 else
1646 add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1);
1647 if (add > 0) {
1641 err = ext4_group_extend_no_check(sb, o_blocks_count, add); 1648 err = ext4_group_extend_no_check(sb, o_blocks_count, add);
1642 if (err) 1649 if (err)
1643 goto out; 1650 goto out;
@@ -1674,7 +1681,7 @@ out:
1674 1681
1675 iput(resize_inode); 1682 iput(resize_inode);
1676 if (test_opt(sb, DEBUG)) 1683 if (test_opt(sb, DEBUG))
1677 printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu " 1684 ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu "
1678 "upto %llu blocks\n", o_blocks_count, n_blocks_count); 1685 "upto %llu blocks", o_blocks_count, n_blocks_count);
1679 return err; 1686 return err;
1680} 1687}
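
The resize math above now derives offset from the last existing block (o_blocks_count - 1), so topping up the old last group takes EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1) blocks, and a resize that stays within one group is just the plain difference. A worked example with hypothetical sizes:

#include <stdio.h>

#define BLOCKS_PER_GROUP 32768ULL	/* assumed group size */

int main(void)
{
	unsigned long long o_blocks = 100000, n_blocks = 200000;
	unsigned long long o_group = (o_blocks - 1) / BLOCKS_PER_GROUP;
	unsigned long long n_group = (n_blocks - 1) / BLOCKS_PER_GROUP;
	unsigned long long offset  = (o_blocks - 1) % BLOCKS_PER_GROUP;
	unsigned long long add;

	if (n_group == o_group)
		add = n_blocks - o_blocks;	/* stays in the same group */
	else
		add = BLOCKS_PER_GROUP - (offset + 1);	/* fill old group */

	printf("extend last group by %llu blocks\n", add);	/* 31072 */
	return 0;
}
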
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 933900909ed0..ceebaf853beb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -62,6 +62,7 @@ static struct ext4_features *ext4_feat;
62 62
63static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 63static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
64 unsigned long journal_devnum); 64 unsigned long journal_devnum);
65static int ext4_show_options(struct seq_file *seq, struct dentry *root);
65static int ext4_commit_super(struct super_block *sb, int sync); 66static int ext4_commit_super(struct super_block *sb, int sync);
66static void ext4_mark_recovery_complete(struct super_block *sb, 67static void ext4_mark_recovery_complete(struct super_block *sb,
67 struct ext4_super_block *es); 68 struct ext4_super_block *es);
@@ -375,7 +376,7 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
375 if (is_handle_aborted(handle)) 376 if (is_handle_aborted(handle))
376 return; 377 return;
377 378
378 printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n", 379 printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
379 caller, line, errstr, err_fn); 380 caller, line, errstr, err_fn);
380 381
381 jbd2_journal_abort_handle(handle); 382 jbd2_journal_abort_handle(handle);
@@ -431,6 +432,22 @@ static int block_device_ejected(struct super_block *sb)
431 return bdi->dev == NULL; 432 return bdi->dev == NULL;
432} 433}
433 434
435static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
436{
437 struct super_block *sb = journal->j_private;
438 struct ext4_sb_info *sbi = EXT4_SB(sb);
439 int error = is_journal_aborted(journal);
440 struct ext4_journal_cb_entry *jce, *tmp;
441
442 spin_lock(&sbi->s_md_lock);
443 list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
444 list_del_init(&jce->jce_list);
445 spin_unlock(&sbi->s_md_lock);
446 jce->jce_func(sb, jce, error);
447 spin_lock(&sbi->s_md_lock);
448 }
449 spin_unlock(&sbi->s_md_lock);
450}
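
Note the locking shape of the new dispatcher: s_md_lock is dropped around each jce_func() call, since the callback may block or free the entry, and the entry is unlinked (list_del_init) before the lock is released. A user-space model of the same discipline with a POSIX mutex (the kernel walks the list with list_for_each_entry_safe(); this sketch pops from the head, but the lock handling is the point):

#include <pthread.h>
#include <stdio.h>

struct entry {
	struct entry *next;
	void (*func)(struct entry *e);
};

static void dispatch(pthread_mutex_t *lock, struct entry **head)
{
	pthread_mutex_lock(lock);
	while (*head) {
		struct entry *e = *head;

		*head = e->next;		/* unlink while locked */
		pthread_mutex_unlock(lock);
		e->func(e);			/* may sleep or free e */
		pthread_mutex_lock(lock);
	}
	pthread_mutex_unlock(lock);
}

static void hello(struct entry *e)
{
	printf("callback %p\n", (void *)e);
}

int main(void)
{
	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	struct entry b = { NULL, hello };
	struct entry a = { &b, hello };
	struct entry *head = &a;

	dispatch(&lock, &head);
	return 0;
}
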
434 451
435/* Deal with the reporting of failure conditions on a filesystem such as 452/* Deal with the reporting of failure conditions on a filesystem such as
436 * inconsistencies detected or read IO failures. 453 * inconsistencies detected or read IO failures.
@@ -498,11 +515,16 @@ void ext4_error_inode(struct inode *inode, const char *function,
498 va_start(args, fmt); 515 va_start(args, fmt);
499 vaf.fmt = fmt; 516 vaf.fmt = fmt;
500 vaf.va = &args; 517 vaf.va = &args;
501 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
502 inode->i_sb->s_id, function, line, inode->i_ino);
503 if (block) 518 if (block)
504 printk(KERN_CONT "block %llu: ", block); 519 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
505 printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf); 520 "inode #%lu: block %llu: comm %s: %pV\n",
521 inode->i_sb->s_id, function, line, inode->i_ino,
522 block, current->comm, &vaf);
523 else
524 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
525 "inode #%lu: comm %s: %pV\n",
526 inode->i_sb->s_id, function, line, inode->i_ino,
527 current->comm, &vaf);
506 va_end(args); 528 va_end(args);
507 529
508 ext4_handle_error(inode->i_sb); 530 ext4_handle_error(inode->i_sb);
@@ -524,15 +546,21 @@ void ext4_error_file(struct file *file, const char *function,
524 path = d_path(&(file->f_path), pathname, sizeof(pathname)); 546 path = d_path(&(file->f_path), pathname, sizeof(pathname));
525 if (IS_ERR(path)) 547 if (IS_ERR(path))
526 path = "(unknown)"; 548 path = "(unknown)";
527 printk(KERN_CRIT
528 "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
529 inode->i_sb->s_id, function, line, inode->i_ino);
530 if (block)
531 printk(KERN_CONT "block %llu: ", block);
532 va_start(args, fmt); 549 va_start(args, fmt);
533 vaf.fmt = fmt; 550 vaf.fmt = fmt;
534 vaf.va = &args; 551 vaf.va = &args;
535 printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf); 552 if (block)
553 printk(KERN_CRIT
554 "EXT4-fs error (device %s): %s:%d: inode #%lu: "
555 "block %llu: comm %s: path %s: %pV\n",
556 inode->i_sb->s_id, function, line, inode->i_ino,
557 block, current->comm, path, &vaf);
558 else
559 printk(KERN_CRIT
560 "EXT4-fs error (device %s): %s:%d: inode #%lu: "
561 "comm %s: path %s: %pV\n",
562 inode->i_sb->s_id, function, line, inode->i_ino,
563 current->comm, path, &vaf);
536 va_end(args); 564 va_end(args);
537 565
538 ext4_handle_error(inode->i_sb); 566 ext4_handle_error(inode->i_sb);
@@ -808,9 +836,6 @@ static void ext4_put_super(struct super_block *sb)
808 destroy_workqueue(sbi->dio_unwritten_wq); 836 destroy_workqueue(sbi->dio_unwritten_wq);
809 837
810 lock_super(sb); 838 lock_super(sb);
811 if (sb->s_dirt)
812 ext4_commit_super(sb, 1);
813
814 if (sbi->s_journal) { 839 if (sbi->s_journal) {
815 err = jbd2_journal_destroy(sbi->s_journal); 840 err = jbd2_journal_destroy(sbi->s_journal);
816 sbi->s_journal = NULL; 841 sbi->s_journal = NULL;
@@ -827,9 +852,12 @@ static void ext4_put_super(struct super_block *sb)
827 if (!(sb->s_flags & MS_RDONLY)) { 852 if (!(sb->s_flags & MS_RDONLY)) {
828 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 853 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
829 es->s_state = cpu_to_le16(sbi->s_mount_state); 854 es->s_state = cpu_to_le16(sbi->s_mount_state);
830 ext4_commit_super(sb, 1);
831 } 855 }
856 if (sb->s_dirt || !(sb->s_flags & MS_RDONLY))
857 ext4_commit_super(sb, 1);
858
832 if (sbi->s_proc) { 859 if (sbi->s_proc) {
860 remove_proc_entry("options", sbi->s_proc);
833 remove_proc_entry(sb->s_id, ext4_proc_root); 861 remove_proc_entry(sb->s_id, ext4_proc_root);
834 } 862 }
835 kobject_del(&sbi->s_kobj); 863 kobject_del(&sbi->s_kobj);
@@ -990,180 +1018,6 @@ void ext4_clear_inode(struct inode *inode)
990 } 1018 }
991} 1019}
992 1020
993static inline void ext4_show_quota_options(struct seq_file *seq,
994 struct super_block *sb)
995{
996#if defined(CONFIG_QUOTA)
997 struct ext4_sb_info *sbi = EXT4_SB(sb);
998
999 if (sbi->s_jquota_fmt) {
1000 char *fmtname = "";
1001
1002 switch (sbi->s_jquota_fmt) {
1003 case QFMT_VFS_OLD:
1004 fmtname = "vfsold";
1005 break;
1006 case QFMT_VFS_V0:
1007 fmtname = "vfsv0";
1008 break;
1009 case QFMT_VFS_V1:
1010 fmtname = "vfsv1";
1011 break;
1012 }
1013 seq_printf(seq, ",jqfmt=%s", fmtname);
1014 }
1015
1016 if (sbi->s_qf_names[USRQUOTA])
1017 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
1018
1019 if (sbi->s_qf_names[GRPQUOTA])
1020 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
1021
1022 if (test_opt(sb, USRQUOTA))
1023 seq_puts(seq, ",usrquota");
1024
1025 if (test_opt(sb, GRPQUOTA))
1026 seq_puts(seq, ",grpquota");
1027#endif
1028}
1029
1030/*
1031 * Show an option if
1032 * - it's set to a non-default value OR
1033 * - if the per-sb default is different from the global default
1034 */
1035static int ext4_show_options(struct seq_file *seq, struct dentry *root)
1036{
1037 int def_errors;
1038 unsigned long def_mount_opts;
1039 struct super_block *sb = root->d_sb;
1040 struct ext4_sb_info *sbi = EXT4_SB(sb);
1041 struct ext4_super_block *es = sbi->s_es;
1042
1043 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
1044 def_errors = le16_to_cpu(es->s_errors);
1045
1046 if (sbi->s_sb_block != 1)
1047 seq_printf(seq, ",sb=%llu", sbi->s_sb_block);
1048 if (test_opt(sb, MINIX_DF))
1049 seq_puts(seq, ",minixdf");
1050 if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS))
1051 seq_puts(seq, ",grpid");
1052 if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS))
1053 seq_puts(seq, ",nogrpid");
1054 if (sbi->s_resuid != EXT4_DEF_RESUID ||
1055 le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) {
1056 seq_printf(seq, ",resuid=%u", sbi->s_resuid);
1057 }
1058 if (sbi->s_resgid != EXT4_DEF_RESGID ||
1059 le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) {
1060 seq_printf(seq, ",resgid=%u", sbi->s_resgid);
1061 }
1062 if (test_opt(sb, ERRORS_RO)) {
1063 if (def_errors == EXT4_ERRORS_PANIC ||
1064 def_errors == EXT4_ERRORS_CONTINUE) {
1065 seq_puts(seq, ",errors=remount-ro");
1066 }
1067 }
1068 if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
1069 seq_puts(seq, ",errors=continue");
1070 if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
1071 seq_puts(seq, ",errors=panic");
1072 if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16))
1073 seq_puts(seq, ",nouid32");
1074 if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
1075 seq_puts(seq, ",debug");
1076#ifdef CONFIG_EXT4_FS_XATTR
1077 if (test_opt(sb, XATTR_USER))
1078 seq_puts(seq, ",user_xattr");
1079 if (!test_opt(sb, XATTR_USER))
1080 seq_puts(seq, ",nouser_xattr");
1081#endif
1082#ifdef CONFIG_EXT4_FS_POSIX_ACL
1083 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
1084 seq_puts(seq, ",acl");
1085 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
1086 seq_puts(seq, ",noacl");
1087#endif
1088 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
1089 seq_printf(seq, ",commit=%u",
1090 (unsigned) (sbi->s_commit_interval / HZ));
1091 }
1092 if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
1093 seq_printf(seq, ",min_batch_time=%u",
1094 (unsigned) sbi->s_min_batch_time);
1095 }
1096 if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
1097 seq_printf(seq, ",max_batch_time=%u",
1098 (unsigned) sbi->s_max_batch_time);
1099 }
1100
1101 /*
1102 * We're changing the default of barrier mount option, so
1103 * let's always display its mount state so it's clear what its
1104 * status is.
1105 */
1106 seq_puts(seq, ",barrier=");
1107 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
1108 if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
1109 seq_puts(seq, ",journal_async_commit");
1110 else if (test_opt(sb, JOURNAL_CHECKSUM))
1111 seq_puts(seq, ",journal_checksum");
1112 if (test_opt(sb, I_VERSION))
1113 seq_puts(seq, ",i_version");
1114 if (!test_opt(sb, DELALLOC) &&
1115 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
1116 seq_puts(seq, ",nodelalloc");
1117
1118 if (!test_opt(sb, MBLK_IO_SUBMIT))
1119 seq_puts(seq, ",nomblk_io_submit");
1120 if (sbi->s_stripe)
1121 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
1122 /*
1123 * journal mode get enabled in different ways
1124 * So just print the value even if we didn't specify it
1125 */
1126 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
1127 seq_puts(seq, ",data=journal");
1128 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
1129 seq_puts(seq, ",data=ordered");
1130 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
1131 seq_puts(seq, ",data=writeback");
1132
1133 if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
1134 seq_printf(seq, ",inode_readahead_blks=%u",
1135 sbi->s_inode_readahead_blks);
1136
1137 if (test_opt(sb, DATA_ERR_ABORT))
1138 seq_puts(seq, ",data_err=abort");
1139
1140 if (test_opt(sb, NO_AUTO_DA_ALLOC))
1141 seq_puts(seq, ",noauto_da_alloc");
1142
1143 if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD))
1144 seq_puts(seq, ",discard");
1145
1146 if (test_opt(sb, NOLOAD))
1147 seq_puts(seq, ",norecovery");
1148
1149 if (test_opt(sb, DIOREAD_NOLOCK))
1150 seq_puts(seq, ",dioread_nolock");
1151
1152 if (test_opt(sb, BLOCK_VALIDITY) &&
1153 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
1154 seq_puts(seq, ",block_validity");
1155
1156 if (!test_opt(sb, INIT_INODE_TABLE))
1157 seq_puts(seq, ",noinit_itable");
1158 else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
1159 seq_printf(seq, ",init_itable=%u",
1160 (unsigned) sbi->s_li_wait_mult);
1161
1162 ext4_show_quota_options(seq, sb);
1163
1164 return 0;
1165}
1166
1167static struct inode *ext4_nfs_get_inode(struct super_block *sb, 1021static struct inode *ext4_nfs_get_inode(struct super_block *sb,
1168 u64 ino, u32 generation) 1022 u64 ino, u32 generation)
1169{ 1023{
@@ -1316,18 +1170,17 @@ static const struct export_operations ext4_export_ops = {
1316enum { 1170enum {
1317 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, 1171 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
1318 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, 1172 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
1319 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, 1173 Opt_nouid32, Opt_debug, Opt_removed,
1320 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 1174 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
1321 Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, 1175 Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
1322 Opt_commit, Opt_min_batch_time, Opt_max_batch_time, 1176 Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
1323 Opt_journal_update, Opt_journal_dev, 1177 Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit,
1324 Opt_journal_checksum, Opt_journal_async_commit,
1325 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1178 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1326 Opt_data_err_abort, Opt_data_err_ignore, 1179 Opt_data_err_abort, Opt_data_err_ignore,
1327 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1180 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1328 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1181 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1329 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, 1182 Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
1330 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, 1183 Opt_usrquota, Opt_grpquota, Opt_i_version,
1331 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, 1184 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
1332 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, 1185 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
1333 Opt_inode_readahead_blks, Opt_journal_ioprio, 1186 Opt_inode_readahead_blks, Opt_journal_ioprio,
@@ -1350,20 +1203,19 @@ static const match_table_t tokens = {
1350 {Opt_err_ro, "errors=remount-ro"}, 1203 {Opt_err_ro, "errors=remount-ro"},
1351 {Opt_nouid32, "nouid32"}, 1204 {Opt_nouid32, "nouid32"},
1352 {Opt_debug, "debug"}, 1205 {Opt_debug, "debug"},
1353 {Opt_oldalloc, "oldalloc"}, 1206 {Opt_removed, "oldalloc"},
1354 {Opt_orlov, "orlov"}, 1207 {Opt_removed, "orlov"},
1355 {Opt_user_xattr, "user_xattr"}, 1208 {Opt_user_xattr, "user_xattr"},
1356 {Opt_nouser_xattr, "nouser_xattr"}, 1209 {Opt_nouser_xattr, "nouser_xattr"},
1357 {Opt_acl, "acl"}, 1210 {Opt_acl, "acl"},
1358 {Opt_noacl, "noacl"}, 1211 {Opt_noacl, "noacl"},
1359 {Opt_noload, "noload"},
1360 {Opt_noload, "norecovery"}, 1212 {Opt_noload, "norecovery"},
1361 {Opt_nobh, "nobh"}, 1213 {Opt_noload, "noload"},
1362 {Opt_bh, "bh"}, 1214 {Opt_removed, "nobh"},
1215 {Opt_removed, "bh"},
1363 {Opt_commit, "commit=%u"}, 1216 {Opt_commit, "commit=%u"},
1364 {Opt_min_batch_time, "min_batch_time=%u"}, 1217 {Opt_min_batch_time, "min_batch_time=%u"},
1365 {Opt_max_batch_time, "max_batch_time=%u"}, 1218 {Opt_max_batch_time, "max_batch_time=%u"},
1366 {Opt_journal_update, "journal=update"},
1367 {Opt_journal_dev, "journal_dev=%u"}, 1219 {Opt_journal_dev, "journal_dev=%u"},
1368 {Opt_journal_checksum, "journal_checksum"}, 1220 {Opt_journal_checksum, "journal_checksum"},
1369 {Opt_journal_async_commit, "journal_async_commit"}, 1221 {Opt_journal_async_commit, "journal_async_commit"},
@@ -1389,7 +1241,6 @@ static const match_table_t tokens = {
1389 {Opt_nobarrier, "nobarrier"}, 1241 {Opt_nobarrier, "nobarrier"},
1390 {Opt_i_version, "i_version"}, 1242 {Opt_i_version, "i_version"},
1391 {Opt_stripe, "stripe=%u"}, 1243 {Opt_stripe, "stripe=%u"},
1392 {Opt_resize, "resize"},
1393 {Opt_delalloc, "delalloc"}, 1244 {Opt_delalloc, "delalloc"},
1394 {Opt_nodelalloc, "nodelalloc"}, 1245 {Opt_nodelalloc, "nodelalloc"},
1395 {Opt_mblk_io_submit, "mblk_io_submit"}, 1246 {Opt_mblk_io_submit, "mblk_io_submit"},
@@ -1408,6 +1259,11 @@ static const match_table_t tokens = {
1408 {Opt_init_itable, "init_itable=%u"}, 1259 {Opt_init_itable, "init_itable=%u"},
1409 {Opt_init_itable, "init_itable"}, 1260 {Opt_init_itable, "init_itable"},
1410 {Opt_noinit_itable, "noinit_itable"}, 1261 {Opt_noinit_itable, "noinit_itable"},
1262 {Opt_removed, "check=none"}, /* mount option from ext2/3 */
1263 {Opt_removed, "nocheck"}, /* mount option from ext2/3 */
1264 {Opt_removed, "reservation"}, /* mount option from ext2/3 */
1265 {Opt_removed, "noreservation"}, /* mount option from ext2/3 */
1266 {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */
1411 {Opt_err, NULL}, 1267 {Opt_err, NULL},
1412}; 1268};
1413 1269
@@ -1496,420 +1352,273 @@ static int clear_qf_name(struct super_block *sb, int qtype)
1496} 1352}
1497#endif 1353#endif
1498 1354
1499static int parse_options(char *options, struct super_block *sb, 1355#define MOPT_SET 0x0001
1500 unsigned long *journal_devnum, 1356#define MOPT_CLEAR 0x0002
1501 unsigned int *journal_ioprio, 1357#define MOPT_NOSUPPORT 0x0004
1502 ext4_fsblk_t *n_blocks_count, int is_remount) 1358#define MOPT_EXPLICIT 0x0008
1503{ 1359#define MOPT_CLEAR_ERR 0x0010
1504 struct ext4_sb_info *sbi = EXT4_SB(sb); 1360#define MOPT_GTE0 0x0020
1505 char *p;
1506 substring_t args[MAX_OPT_ARGS];
1507 int data_opt = 0;
1508 int option;
1509#ifdef CONFIG_QUOTA 1361#ifdef CONFIG_QUOTA
1510 int qfmt; 1362#define MOPT_Q 0
1363#define MOPT_QFMT 0x0040
1364#else
1365#define MOPT_Q MOPT_NOSUPPORT
1366#define MOPT_QFMT MOPT_NOSUPPORT
1511#endif 1367#endif
1512 1368#define MOPT_DATAJ 0x0080
1513 if (!options) 1369
1514 return 1; 1370static const struct mount_opts {
1515 1371 int token;
1516 while ((p = strsep(&options, ",")) != NULL) { 1372 int mount_opt;
1517 int token; 1373 int flags;
1518 if (!*p) 1374} ext4_mount_opts[] = {
1519 continue; 1375 {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
1520 1376 {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
1521 /* 1377 {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
1522 * Initialize args struct so we know whether arg was 1378 {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
1523 * found; some options take optional arguments. 1379 {Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET},
1524 */ 1380 {Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR},
1525 args[0].to = args[0].from = NULL; 1381 {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
1526 token = match_token(p, tokens, args); 1382 {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
1527 switch (token) { 1383 {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET},
1528 case Opt_bsd_df: 1384 {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR},
1529 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1385 {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
1530 clear_opt(sb, MINIX_DF); 1386 {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
1531 break; 1387 {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT},
1532 case Opt_minix_df: 1388 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT},
1533 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1389 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET},
1534 set_opt(sb, MINIX_DF); 1390 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
1535 1391 EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET},
1536 break; 1392 {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET},
1537 case Opt_grpid: 1393 {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
1538 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1394 {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
1539 set_opt(sb, GRPID); 1395 {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
1540 1396 {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET},
1541 break; 1397 {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR},
1542 case Opt_nogrpid: 1398 {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
1543 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1399 {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
1544 clear_opt(sb, GRPID); 1400 {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
1545 1401 {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
1546 break; 1402 {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
1547 case Opt_resuid: 1403 {Opt_commit, 0, MOPT_GTE0},
1548 if (match_int(&args[0], &option)) 1404 {Opt_max_batch_time, 0, MOPT_GTE0},
1549 return 0; 1405 {Opt_min_batch_time, 0, MOPT_GTE0},
1550 sbi->s_resuid = option; 1406 {Opt_inode_readahead_blks, 0, MOPT_GTE0},
1551 break; 1407 {Opt_init_itable, 0, MOPT_GTE0},
1552 case Opt_resgid: 1408 {Opt_stripe, 0, MOPT_GTE0},
1553 if (match_int(&args[0], &option)) 1409 {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ},
1554 return 0; 1410 {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ},
1555 sbi->s_resgid = option; 1411 {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ},
1556 break;
1557 case Opt_sb:
1558 /* handled by get_sb_block() instead of here */
1559 /* *sb_block = match_int(&args[0]); */
1560 break;
1561 case Opt_err_panic:
1562 clear_opt(sb, ERRORS_CONT);
1563 clear_opt(sb, ERRORS_RO);
1564 set_opt(sb, ERRORS_PANIC);
1565 break;
1566 case Opt_err_ro:
1567 clear_opt(sb, ERRORS_CONT);
1568 clear_opt(sb, ERRORS_PANIC);
1569 set_opt(sb, ERRORS_RO);
1570 break;
1571 case Opt_err_cont:
1572 clear_opt(sb, ERRORS_RO);
1573 clear_opt(sb, ERRORS_PANIC);
1574 set_opt(sb, ERRORS_CONT);
1575 break;
1576 case Opt_nouid32:
1577 set_opt(sb, NO_UID32);
1578 break;
1579 case Opt_debug:
1580 set_opt(sb, DEBUG);
1581 break;
1582 case Opt_oldalloc:
1583 ext4_msg(sb, KERN_WARNING,
1584 "Ignoring deprecated oldalloc option");
1585 break;
1586 case Opt_orlov:
1587 ext4_msg(sb, KERN_WARNING,
1588 "Ignoring deprecated orlov option");
1589 break;
1590#ifdef CONFIG_EXT4_FS_XATTR 1412#ifdef CONFIG_EXT4_FS_XATTR
1591 case Opt_user_xattr: 1413 {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
1592 set_opt(sb, XATTR_USER); 1414 {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
1593 break;
1594 case Opt_nouser_xattr:
1595 clear_opt(sb, XATTR_USER);
1596 break;
1597#else 1415#else
1598 case Opt_user_xattr: 1416 {Opt_user_xattr, 0, MOPT_NOSUPPORT},
1599 case Opt_nouser_xattr: 1417 {Opt_nouser_xattr, 0, MOPT_NOSUPPORT},
1600 ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported");
1601 break;
1602#endif 1418#endif
1603#ifdef CONFIG_EXT4_FS_POSIX_ACL 1419#ifdef CONFIG_EXT4_FS_POSIX_ACL
1604 case Opt_acl: 1420 {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
1605 set_opt(sb, POSIX_ACL); 1421 {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
1606 break;
1607 case Opt_noacl:
1608 clear_opt(sb, POSIX_ACL);
1609 break;
1610#else 1422#else
1611 case Opt_acl: 1423 {Opt_acl, 0, MOPT_NOSUPPORT},
1612 case Opt_noacl: 1424 {Opt_noacl, 0, MOPT_NOSUPPORT},
1613 ext4_msg(sb, KERN_ERR, "(no)acl options not supported");
1614 break;
1615#endif 1425#endif
1616 case Opt_journal_update:
1617 /* @@@ FIXME */
1618 /* Eventually we will want to be able to create
1619 a journal file here. For now, only allow the
1620 user to specify an existing inode to be the
1621 journal file. */
1622 if (is_remount) {
1623 ext4_msg(sb, KERN_ERR,
1624 "Cannot specify journal on remount");
1625 return 0;
1626 }
1627 set_opt(sb, UPDATE_JOURNAL);
1628 break;
1629 case Opt_journal_dev:
1630 if (is_remount) {
1426 {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
1427 {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
1428 {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
1429 {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
1430 MOPT_SET | MOPT_Q},
1431 {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
1432 MOPT_SET | MOPT_Q},
1433 {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
1434 EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q},
1435 {Opt_usrjquota, 0, MOPT_Q},
1436 {Opt_grpjquota, 0, MOPT_Q},
1437 {Opt_offusrjquota, 0, MOPT_Q},
1438 {Opt_offgrpjquota, 0, MOPT_Q},
1439 {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
1440 {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
1441 {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
1442 {Opt_err, 0, 0}
1443};
1444
1445static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1446 substring_t *args, unsigned long *journal_devnum,
1447 unsigned int *journal_ioprio, int is_remount)
1448{
1449 struct ext4_sb_info *sbi = EXT4_SB(sb);
1450 const struct mount_opts *m;
1451 int arg = 0;
1452
1453 if (args->from && match_int(args, &arg))
1454 return -1;
1455 switch (token) {
1456 case Opt_noacl:
1457 case Opt_nouser_xattr:
1458 ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
1459 break;
1460 case Opt_sb:
1461 return 1; /* handled by get_sb_block() */
1462 case Opt_removed:
1463 ext4_msg(sb, KERN_WARNING,
1464 "Ignoring removed %s option", opt);
1465 return 1;
1466 case Opt_resuid:
1467 sbi->s_resuid = arg;
1468 return 1;
1469 case Opt_resgid:
1470 sbi->s_resgid = arg;
1471 return 1;
1472 case Opt_abort:
1473 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
1474 return 1;
1475 case Opt_i_version:
1476 sb->s_flags |= MS_I_VERSION;
1477 return 1;
1478 case Opt_journal_dev:
1479 if (is_remount) {
1480 ext4_msg(sb, KERN_ERR,
1481 "Cannot specify journal on remount");
1482 return -1;
1483 }
1484 *journal_devnum = arg;
1485 return 1;
1486 case Opt_journal_ioprio:
1487 if (arg < 0 || arg > 7)
1488 return -1;
1489 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
1490 return 1;
1491 }
1492
1493 for (m = ext4_mount_opts; m->token != Opt_err; m++) {
1494 if (token != m->token)
1495 continue;
1496 if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
1497 return -1;
1498 if (m->flags & MOPT_EXPLICIT)
1499 set_opt2(sb, EXPLICIT_DELALLOC);
1500 if (m->flags & MOPT_CLEAR_ERR)
1501 clear_opt(sb, ERRORS_MASK);
1502 if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
1503 ext4_msg(sb, KERN_ERR, "Cannot change quota "
1504 "options when quota turned on");
1505 return -1;
1506 }
1507
1508 if (m->flags & MOPT_NOSUPPORT) {
1509 ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
1510 } else if (token == Opt_commit) {
1511 if (arg == 0)
1512 arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
1513 sbi->s_commit_interval = HZ * arg;
1514 } else if (token == Opt_max_batch_time) {
1515 if (arg == 0)
1516 arg = EXT4_DEF_MAX_BATCH_TIME;
1517 sbi->s_max_batch_time = arg;
1518 } else if (token == Opt_min_batch_time) {
1519 sbi->s_min_batch_time = arg;
1520 } else if (token == Opt_inode_readahead_blks) {
1521 if (arg > (1 << 30))
1522 return -1;
1523 if (arg && !is_power_of_2(arg)) {
1631 ext4_msg(sb, KERN_ERR,
1632 "Cannot specify journal on remount");
1633 return 0;
1634 }
1635 if (match_int(&args[0], &option))
1636 return 0;
1637 *journal_devnum = option;
1638 break;
1639 case Opt_journal_checksum:
1640 set_opt(sb, JOURNAL_CHECKSUM);
1641 break;
1642 case Opt_journal_async_commit:
1643 set_opt(sb, JOURNAL_ASYNC_COMMIT);
1644 set_opt(sb, JOURNAL_CHECKSUM);
1524 ext4_msg(sb, KERN_ERR,
1525 "EXT4-fs: inode_readahead_blks"
1526 " must be a power of 2");
1527 return -1;
1528 }
1529 sbi->s_inode_readahead_blks = arg;
1530 } else if (token == Opt_init_itable) {
1531 set_opt(sb, INIT_INODE_TABLE);
1532 if (!args->from)
1533 arg = EXT4_DEF_LI_WAIT_MULT;
1534 sbi->s_li_wait_mult = arg;
1535 } else if (token == Opt_stripe) {
1536 sbi->s_stripe = arg;
1537 } else if (m->flags & MOPT_DATAJ) {
1645 break;
1646 case Opt_noload:
1647 set_opt(sb, NOLOAD);
1648 break;
1649 case Opt_commit:
1650 if (match_int(&args[0], &option))
1651 return 0;
1652 if (option < 0)
1653 return 0;
1654 if (option == 0)
1655 option = JBD2_DEFAULT_MAX_COMMIT_AGE;
1656 sbi->s_commit_interval = HZ * option;
1657 break;
1658 case Opt_max_batch_time:
1659 if (match_int(&args[0], &option))
1660 return 0;
1661 if (option < 0)
1662 return 0;
1663 if (option == 0)
1664 option = EXT4_DEF_MAX_BATCH_TIME;
1665 sbi->s_max_batch_time = option;
1666 break;
1667 case Opt_min_batch_time:
1668 if (match_int(&args[0], &option))
1669 return 0;
1670 if (option < 0)
1671 return 0;
1672 sbi->s_min_batch_time = option;
1673 break;
1674 case Opt_data_journal:
1675 data_opt = EXT4_MOUNT_JOURNAL_DATA;
1676 goto datacheck;
1677 case Opt_data_ordered:
1678 data_opt = EXT4_MOUNT_ORDERED_DATA;
1679 goto datacheck;
1680 case Opt_data_writeback:
1681 data_opt = EXT4_MOUNT_WRITEBACK_DATA;
1682 datacheck:
1683 if (is_remount) {
1684 if (!sbi->s_journal)
1685 ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
1686 else if (test_opt(sb, DATA_FLAGS) != data_opt) {
1687 ext4_msg(sb, KERN_ERR,
1688 "Cannot change data mode on remount");
1689 return 0;
1690 }
1691 } else {
1692 clear_opt(sb, DATA_FLAGS);
1693 sbi->s_mount_opt |= data_opt;
1694 }
1538 if (is_remount) {
1539 if (!sbi->s_journal)
1540 ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
1541 else if (test_opt(sb, DATA_FLAGS) !=
1542 m->mount_opt) {
1543 ext4_msg(sb, KERN_ERR,
1544 "Cannot change data mode on remount");
1545 return -1;
1546 }
1547 } else {
1548 clear_opt(sb, DATA_FLAGS);
1549 sbi->s_mount_opt |= m->mount_opt;
1550 }
1695 break;
1696 case Opt_data_err_abort:
1697 set_opt(sb, DATA_ERR_ABORT);
1698 break;
1699 case Opt_data_err_ignore:
1700 clear_opt(sb, DATA_ERR_ABORT);
1701 break;
1702#ifdef CONFIG_QUOTA
1703 case Opt_usrjquota:
1704 if (!set_qf_name(sb, USRQUOTA, &args[0]))
1705 return 0;
1706 break;
1707 case Opt_grpjquota:
1708 if (!set_qf_name(sb, GRPQUOTA, &args[0]))
1709 return 0;
1710 break;
1711 case Opt_offusrjquota:
1712 if (!clear_qf_name(sb, USRQUOTA))
1713 return 0;
1714 break;
1715 case Opt_offgrpjquota:
1716 if (!clear_qf_name(sb, GRPQUOTA))
1717 return 0;
1718 break;
1551#ifdef CONFIG_QUOTA
1552 } else if (token == Opt_usrjquota) {
1553 if (!set_qf_name(sb, USRQUOTA, &args[0]))
1554 return -1;
1555 } else if (token == Opt_grpjquota) {
1556 if (!set_qf_name(sb, GRPQUOTA, &args[0]))
1557 return -1;
1558 } else if (token == Opt_offusrjquota) {
1559 if (!clear_qf_name(sb, USRQUOTA))
1560 return -1;
1561 } else if (token == Opt_offgrpjquota) {
1562 if (!clear_qf_name(sb, GRPQUOTA))
1563 return -1;
1564 } else if (m->flags & MOPT_QFMT) {
1719
1720 case Opt_jqfmt_vfsold:
1721 qfmt = QFMT_VFS_OLD;
1722 goto set_qf_format;
1723 case Opt_jqfmt_vfsv0:
1724 qfmt = QFMT_VFS_V0;
1725 goto set_qf_format;
1726 case Opt_jqfmt_vfsv1:
1727 qfmt = QFMT_VFS_V1;
1728set_qf_format:
1729 if (sb_any_quota_loaded(sb) &&
1730 sbi->s_jquota_fmt != qfmt) {
1731 ext4_msg(sb, KERN_ERR, "Cannot change "
1732 "journaled quota options when "
1733 "quota turned on");
1734 return 0;
1565 if (sb_any_quota_loaded(sb) &&
1566 sbi->s_jquota_fmt != m->mount_opt) {
1567 ext4_msg(sb, KERN_ERR, "Cannot "
1568 "change journaled quota options "
1569 "when quota turned on");
1570 return -1;
1735 }
1736 sbi->s_jquota_fmt = qfmt;
1737 break;
1738 case Opt_quota:
1739 case Opt_usrquota:
1740 set_opt(sb, QUOTA);
1741 set_opt(sb, USRQUOTA);
1742 break;
1743 case Opt_grpquota:
1744 set_opt(sb, QUOTA);
1745 set_opt(sb, GRPQUOTA);
1746 break;
1747 case Opt_noquota:
1748 if (sb_any_quota_loaded(sb)) {
1749 ext4_msg(sb, KERN_ERR, "Cannot change quota "
1750 "options when quota turned on");
1751 return 0;
1752 }
1753 clear_opt(sb, QUOTA);
1571 }
1572 sbi->s_jquota_fmt = m->mount_opt;
1754 clear_opt(sb, USRQUOTA);
1755 clear_opt(sb, GRPQUOTA);
1756 break;
1757#else
1758 case Opt_quota:
1759 case Opt_usrquota:
1760 case Opt_grpquota:
1761 ext4_msg(sb, KERN_ERR,
1762 "quota options not supported");
1763 break;
1764 case Opt_usrjquota:
1765 case Opt_grpjquota:
1766 case Opt_offusrjquota:
1767 case Opt_offgrpjquota:
1768 case Opt_jqfmt_vfsold:
1769 case Opt_jqfmt_vfsv0:
1770 case Opt_jqfmt_vfsv1:
1771 ext4_msg(sb, KERN_ERR,
1772 "journaled quota options not supported");
1773 break;
1774 case Opt_noquota:
1775 break;
1776#endif
1777 case Opt_abort:
1778 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
1779 break;
1780 case Opt_nobarrier:
1781 clear_opt(sb, BARRIER);
1782 break;
1783 case Opt_barrier:
1784 if (args[0].from) {
1785 if (match_int(&args[0], &option))
1786 return 0;
1787 } else
1788 option = 1; /* No argument, default to 1 */
1789 if (option)
1790 set_opt(sb, BARRIER);
1791 else
1792 clear_opt(sb, BARRIER);
1793 break;
1573#endif
1574 } else {
1575 if (!args->from)
1576 arg = 1;
1577 if (m->flags & MOPT_CLEAR)
1578 arg = !arg;
1579 else if (unlikely(!(m->flags & MOPT_SET))) {
1580 ext4_msg(sb, KERN_WARNING,
1581 "buggy handling of option %s", opt);
1582 WARN_ON(1);
1583 return -1;
1794 case Opt_ignore:
1795 break;
1796 case Opt_resize:
1797 if (!is_remount) {
1798 ext4_msg(sb, KERN_ERR,
1799 "resize option only available "
1800 "for remount");
1801 return 0;
1802 }
1803 if (match_int(&args[0], &option) != 0)
1804 return 0;
1805 *n_blocks_count = option;
1806 break;
1807 case Opt_nobh:
1808 ext4_msg(sb, KERN_WARNING,
1809 "Ignoring deprecated nobh option");
1810 break;
1811 case Opt_bh:
1812 ext4_msg(sb, KERN_WARNING,
1813 "Ignoring deprecated bh option");
1814 break;
1815 case Opt_i_version:
1816 set_opt(sb, I_VERSION);
1817 sb->s_flags |= MS_I_VERSION;
1818 break;
1819 case Opt_nodelalloc:
1820 clear_opt(sb, DELALLOC);
1821 clear_opt2(sb, EXPLICIT_DELALLOC);
1822 break;
1823 case Opt_mblk_io_submit:
1824 set_opt(sb, MBLK_IO_SUBMIT);
1825 break;
1826 case Opt_nomblk_io_submit:
1827 clear_opt(sb, MBLK_IO_SUBMIT);
1828 break;
1829 case Opt_stripe:
1830 if (match_int(&args[0], &option))
1831 return 0;
1832 if (option < 0)
1833 return 0;
1834 sbi->s_stripe = option;
1835 break;
1836 case Opt_delalloc:
1837 set_opt(sb, DELALLOC);
1838 set_opt2(sb, EXPLICIT_DELALLOC);
1839 break;
1840 case Opt_block_validity:
1841 set_opt(sb, BLOCK_VALIDITY);
1842 break;
1843 case Opt_noblock_validity:
1844 clear_opt(sb, BLOCK_VALIDITY);
1845 break;
1846 case Opt_inode_readahead_blks:
1847 if (match_int(&args[0], &option))
1848 return 0;
1849 if (option < 0 || option > (1 << 30))
1850 return 0;
1851 if (option && !is_power_of_2(option)) {
1852 ext4_msg(sb, KERN_ERR,
1853 "EXT4-fs: inode_readahead_blks"
1854 " must be a power of 2");
1855 return 0;
1856 }
1857 sbi->s_inode_readahead_blks = option;
1858 break;
1584 }
1585 if (arg != 0)
1586 sbi->s_mount_opt |= m->mount_opt;
1859 case Opt_journal_ioprio:
1860 if (match_int(&args[0], &option))
1861 return 0;
1862 if (option < 0 || option > 7)
1863 break;
1864 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
1865 option);
1866 break;
1867 case Opt_noauto_da_alloc:
1868 set_opt(sb, NO_AUTO_DA_ALLOC);
1869 break;
1870 case Opt_auto_da_alloc:
1871 if (args[0].from) {
1872 if (match_int(&args[0], &option))
1873 return 0;
1874 } else
1875 option = 1; /* No argument, default to 1 */
1876 if (option)
1877 clear_opt(sb, NO_AUTO_DA_ALLOC);
1878 else
1879 set_opt(sb, NO_AUTO_DA_ALLOC);
1587 else
1588 sbi->s_mount_opt &= ~m->mount_opt;
1880 break;
1881 case Opt_discard:
1882 set_opt(sb, DISCARD);
1883 break;
1884 case Opt_nodiscard:
1885 clear_opt(sb, DISCARD);
1886 break;
1887 case Opt_dioread_nolock:
1888 set_opt(sb, DIOREAD_NOLOCK);
1889 break;
1890 case Opt_dioread_lock:
1891 clear_opt(sb, DIOREAD_NOLOCK);
1892 break;
1893 case Opt_init_itable:
1894 set_opt(sb, INIT_INODE_TABLE);
1895 if (args[0].from) {
1896 if (match_int(&args[0], &option))
1897 return 0;
1898 } else
1899 option = EXT4_DEF_LI_WAIT_MULT;
1900 if (option < 0)
1901 return 0;
1902 sbi->s_li_wait_mult = option;
1903 break;
1904 case Opt_noinit_itable:
1905 clear_opt(sb, INIT_INODE_TABLE);
1906 break;
1907 default:
1908 ext4_msg(sb, KERN_ERR,
1909 "Unrecognized mount option \"%s\" "
1910 "or missing value", p);
1911 return 0;
1912 }
1589 }
1590 return 1;
1591 }
1592 ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
1593 "or missing value", opt);
1594 return -1;
1595}
1596
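The new handle_mount_opt() replaces most of the old switch with a walk over
the mount_opts table: each entry names the EXT4_MOUNT_* bits an option
touches and carries MOPT_* flags saying whether the option sets or clears
them. The sketch below shows the same table-driven pattern as self-contained
userspace C; the names (struct mopt, OPT_SET, MNT_*) are illustrative
stand-ins, not the kernel definitions.

#include <stdio.h>
#include <string.h>

#define OPT_SET   0x01	/* option sets bits in the mount state */
#define OPT_CLEAR 0x02	/* option clears bits in the mount state */

#define MNT_BARRIER 0x10
#define MNT_DISCARD 0x20

struct mopt {
	const char *name;	/* option name as typed by the user */
	unsigned int mount_opt;	/* bits it affects */
	unsigned int flags;	/* OPT_SET or OPT_CLEAR */
};

static const struct mopt opts[] = {
	{ "barrier",   MNT_BARRIER, OPT_SET   },
	{ "nobarrier", MNT_BARRIER, OPT_CLEAR },
	{ "discard",   MNT_DISCARD, OPT_SET   },
	{ "nodiscard", MNT_DISCARD, OPT_CLEAR },
	{ NULL, 0, 0 }
};

/* Apply one option token to *state; return 0 on success, -1 if unknown. */
static int handle_opt(const char *tok, unsigned int *state)
{
	const struct mopt *m;

	for (m = opts; m->name; m++) {
		if (strcmp(tok, m->name))
			continue;
		if (m->flags & OPT_SET)
			*state |= m->mount_opt;
		else
			*state &= ~m->mount_opt;
		return 0;
	}
	return -1;
}

int main(void)
{
	unsigned int state = MNT_BARRIER;	/* default */

	handle_opt("nobarrier", &state);
	handle_opt("discard", &state);
	printf("state = 0x%x\n", state);	/* prints state = 0x20 */
	return 0;
}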
1597static int parse_options(char *options, struct super_block *sb,
1598 unsigned long *journal_devnum,
1599 unsigned int *journal_ioprio,
1600 int is_remount)
1601{
1602 struct ext4_sb_info *sbi = EXT4_SB(sb);
1603 char *p;
1604 substring_t args[MAX_OPT_ARGS];
1605 int token;
1606
1607 if (!options)
1608 return 1;
1609
1610 while ((p = strsep(&options, ",")) != NULL) {
1611 if (!*p)
1612 continue;
1613 /*
1614 * Initialize args struct so we know whether arg was
1615 * found; some options take optional arguments.
1616 */
1617 args[0].to = args[0].from = 0;
1618 token = match_token(p, tokens, args);
1619 if (handle_mount_opt(sb, p, token, args, journal_devnum,
1620 journal_ioprio, is_remount) < 0)
1621 return 0;
1913 } 1622 }
1914#ifdef CONFIG_QUOTA 1623#ifdef CONFIG_QUOTA
1915 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1624 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
@@ -1942,6 +1651,160 @@ set_qf_format:
1942 return 1; 1651 return 1;
1943} 1652}
1944 1653
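parse_options() above splits the option string on commas with strsep() and
hands each non-empty token to handle_mount_opt(). A minimal standalone
illustration of that tokenizing loop (the option string is made up):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[] = "noatime,,data=ordered,commit=5";
	char *options = buf, *p;

	/* strsep() consumes the buffer in place, returning one
	 * comma-separated token per call; empty tokens (",,") come
	 * back as "" and are skipped, as in parse_options(). */
	while ((p = strsep(&options, ",")) != NULL) {
		if (!*p)
			continue;
		printf("token: %s\n", p);
	}
	return 0;
}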
1654static inline void ext4_show_quota_options(struct seq_file *seq,
1655 struct super_block *sb)
1656{
1657#if defined(CONFIG_QUOTA)
1658 struct ext4_sb_info *sbi = EXT4_SB(sb);
1659
1660 if (sbi->s_jquota_fmt) {
1661 char *fmtname = "";
1662
1663 switch (sbi->s_jquota_fmt) {
1664 case QFMT_VFS_OLD:
1665 fmtname = "vfsold";
1666 break;
1667 case QFMT_VFS_V0:
1668 fmtname = "vfsv0";
1669 break;
1670 case QFMT_VFS_V1:
1671 fmtname = "vfsv1";
1672 break;
1673 }
1674 seq_printf(seq, ",jqfmt=%s", fmtname);
1675 }
1676
1677 if (sbi->s_qf_names[USRQUOTA])
1678 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
1679
1680 if (sbi->s_qf_names[GRPQUOTA])
1681 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
1682
1683 if (test_opt(sb, USRQUOTA))
1684 seq_puts(seq, ",usrquota");
1685
1686 if (test_opt(sb, GRPQUOTA))
1687 seq_puts(seq, ",grpquota");
1688#endif
1689}
1690
1691static const char *token2str(int token)
1692{
1693 static const struct match_token *t;
1694
1695 for (t = tokens; t->token != Opt_err; t++)
1696 if (t->token == token && !strchr(t->pattern, '='))
1697 break;
1698 return t->pattern;
1699}
1700
1701/*
1702 * Show an option if
1703 * - it's set to a non-default value OR
1704 * - the per-sb default is different from the global default
1705 */
1706static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
1707 int nodefs)
1708{
1709 struct ext4_sb_info *sbi = EXT4_SB(sb);
1710 struct ext4_super_block *es = sbi->s_es;
1711 int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt;
1712 const struct mount_opts *m;
1713 char sep = nodefs ? '\n' : ',';
1714
1715#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
1716#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)
1717
1718 if (sbi->s_sb_block != 1)
1719 SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);
1720
1721 for (m = ext4_mount_opts; m->token != Opt_err; m++) {
1722 int want_set = m->flags & MOPT_SET;
1723 if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
1724 (m->flags & MOPT_CLEAR_ERR))
1725 continue;
1726 if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
1727 continue; /* skip if same as the default */
1728 if ((want_set &&
1729 (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
1730 (!want_set && (sbi->s_mount_opt & m->mount_opt)))
1731 continue; /* select Opt_noFoo vs Opt_Foo */
1732 SEQ_OPTS_PRINT("%s", token2str(m->token));
1733 }
1734
1735 if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID ||
1736 le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
1737 SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid);
1738 if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID ||
1739 le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
1740 SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid);
1741 def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
1742 if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
1743 SEQ_OPTS_PUTS("errors=remount-ro");
1744 if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
1745 SEQ_OPTS_PUTS("errors=continue");
1746 if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
1747 SEQ_OPTS_PUTS("errors=panic");
1748 if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
1749 SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
1750 if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
1751 SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
1752 if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
1753 SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
1754 if (sb->s_flags & MS_I_VERSION)
1755 SEQ_OPTS_PUTS("i_version");
1756 if (nodefs || sbi->s_stripe)
1757 SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
1758 if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) {
1759 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
1760 SEQ_OPTS_PUTS("data=journal");
1761 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
1762 SEQ_OPTS_PUTS("data=ordered");
1763 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
1764 SEQ_OPTS_PUTS("data=writeback");
1765 }
1766 if (nodefs ||
1767 sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
1768 SEQ_OPTS_PRINT("inode_readahead_blks=%u",
1769 sbi->s_inode_readahead_blks);
1770
1771 if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
1772 (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
1773 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
1774
1775 ext4_show_quota_options(seq, sb);
1776 return 0;
1777}
1778
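_ext4_show_options() above emits every option behind a separator character:
',' yields the one-line /proc/mounts form and '\n' the one-option-per-line
dump, which is all the SEQ_OPTS_* macros abstract. A small userspace sketch
of the same trick (the function and option names are invented for the
example):

#include <stdio.h>

/* Emit options prefixed by 'sep': ',' gives the /proc/mounts style
 * one-liner, '\n' gives one option per line (the nodefs view). */
static void show_options(char sep, unsigned int stripe)
{
	printf("%cerrors=remount-ro", sep);
	if (stripe)
		printf("%cstripe=%u", sep, stripe);
}

int main(void)
{
	printf("rw");
	show_options(',', 32);	/* rw,errors=remount-ro,stripe=32 */
	printf("\n");
	return 0;
}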
1779static int ext4_show_options(struct seq_file *seq, struct dentry *root)
1780{
1781 return _ext4_show_options(seq, root->d_sb, 0);
1782}
1783
1784static int options_seq_show(struct seq_file *seq, void *offset)
1785{
1786 struct super_block *sb = seq->private;
1787 int rc;
1788
1789 seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw");
1790 rc = _ext4_show_options(seq, sb, 1);
1791 seq_puts(seq, "\n");
1792 return rc;
1793}
1794
1795static int options_open_fs(struct inode *inode, struct file *file)
1796{
1797 return single_open(file, options_seq_show, PDE(inode)->data);
1798}
1799
1800static const struct file_operations ext4_seq_options_fops = {
1801 .owner = THIS_MODULE,
1802 .open = options_open_fs,
1803 .read = seq_read,
1804 .llseek = seq_lseek,
1805 .release = single_release,
1806};
1807
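ext4_seq_options_fops above wires the new /proc/fs/ext4/<dev>/options file
into the seq_file framework through single_open(). The fragment below is a
generic sketch of that pattern against the procfs API of this era (demo_*
names are placeholders; error handling is trimmed):

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
	/* seq_printf() appends to an in-kernel buffer; the seq_file
	 * core then handles short reads and llseek for us. */
	seq_printf(m, "hello from %p\n", m->private);
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	/* PDE(inode)->data is whatever was passed to proc_create_data() */
	return single_open(file, demo_show, PDE(inode)->data);
}

static const struct file_operations demo_fops = {
	.owner   = THIS_MODULE,
	.open    = demo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};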
1945static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, 1808static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1946 int read_only) 1809 int read_only)
1947{ 1810{
@@ -2945,7 +2808,7 @@ static int ext4_run_lazyinit_thread(void)
2945 ext4_clear_request_list(); 2808 ext4_clear_request_list();
2946 kfree(ext4_li_info); 2809 kfree(ext4_li_info);
2947 ext4_li_info = NULL; 2810 ext4_li_info = NULL;
2948 printk(KERN_CRIT "EXT4: error %d creating inode table "
2811 printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
2949 "initialization thread\n", 2812 "initialization thread\n",
2950 err); 2813 err);
2951 return err; 2814 return err;
@@ -3183,11 +3046,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3183 set_opt(sb, INIT_INODE_TABLE); 3046 set_opt(sb, INIT_INODE_TABLE);
3184 if (def_mount_opts & EXT4_DEFM_DEBUG) 3047 if (def_mount_opts & EXT4_DEFM_DEBUG)
3185 set_opt(sb, DEBUG); 3048 set_opt(sb, DEBUG);
3186 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
3187 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
3188 "2.6.38");
3189 set_opt(sb, GRPID);
3190 }
3049 if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
3050 set_opt(sb, GRPID);
3191 if (def_mount_opts & EXT4_DEFM_UID16) 3051 if (def_mount_opts & EXT4_DEFM_UID16)
3192 set_opt(sb, NO_UID32); 3052 set_opt(sb, NO_UID32);
3193 /* xattr user namespace & acls are now defaulted on */ 3053 /* xattr user namespace & acls are now defaulted on */
@@ -3240,13 +3100,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3240 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; 3100 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
3241 3101
3242 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, 3102 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
3243 &journal_devnum, &journal_ioprio, NULL, 0)) {
3103 &journal_devnum, &journal_ioprio, 0)) {
3244 ext4_msg(sb, KERN_WARNING, 3104 ext4_msg(sb, KERN_WARNING,
3245 "failed to parse options in superblock: %s", 3105 "failed to parse options in superblock: %s",
3246 sbi->s_es->s_mount_opts); 3106 sbi->s_es->s_mount_opts);
3247 } 3107 }
3108 sbi->s_def_mount_opt = sbi->s_mount_opt;
3248 if (!parse_options((char *) data, sb, &journal_devnum, 3109 if (!parse_options((char *) data, sb, &journal_devnum,
3249 &journal_ioprio, NULL, 0))
3110 &journal_ioprio, 0))
3250 goto failed_mount; 3111 goto failed_mount;
3251 3112
3252 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 3113 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
@@ -3416,7 +3277,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3416#else 3277#else
3417 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); 3278 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
3418#endif 3279#endif
3419 sb->s_dirt = 1;
3420 } 3280 }
3421 3281
3422 /* Handle clustersize */ 3282 /* Handle clustersize */
@@ -3540,6 +3400,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3540 if (ext4_proc_root) 3400 if (ext4_proc_root)
3541 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); 3401 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
3542 3402
3403 if (sbi->s_proc)
3404 proc_create_data("options", S_IRUGO, sbi->s_proc,
3405 &ext4_seq_options_fops, sb);
3406
3543 bgl_lock_init(sbi->s_blockgroup_lock); 3407 bgl_lock_init(sbi->s_blockgroup_lock);
3544 3408
3545 for (i = 0; i < db_count; i++) { 3409 for (i = 0; i < db_count; i++) {
@@ -3694,6 +3558,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3694 } 3558 }
3695 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 3559 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3696 3560
3561 sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
3562
3697 /* 3563 /*
3698 * The journal may have updated the bg summary counts, so we 3564 * The journal may have updated the bg summary counts, so we
3699 * need to update the global counters. 3565 * need to update the global counters.
@@ -3861,6 +3727,7 @@ failed_mount2:
3861 ext4_kvfree(sbi->s_group_desc); 3727 ext4_kvfree(sbi->s_group_desc);
3862failed_mount: 3728failed_mount:
3863 if (sbi->s_proc) { 3729 if (sbi->s_proc) {
3730 remove_proc_entry("options", sbi->s_proc);
3864 remove_proc_entry(sb->s_id, ext4_proc_root); 3731 remove_proc_entry(sb->s_id, ext4_proc_root);
3865 } 3732 }
3866#ifdef CONFIG_QUOTA 3733#ifdef CONFIG_QUOTA
@@ -4090,15 +3957,6 @@ static int ext4_load_journal(struct super_block *sb,
4090 if (!(journal->j_flags & JBD2_BARRIER)) 3957 if (!(journal->j_flags & JBD2_BARRIER))
4091 ext4_msg(sb, KERN_INFO, "barriers disabled"); 3958 ext4_msg(sb, KERN_INFO, "barriers disabled");
4092 3959
4093 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
4094 err = jbd2_journal_update_format(journal);
4095 if (err) {
4096 ext4_msg(sb, KERN_ERR, "error updating journal");
4097 jbd2_journal_destroy(journal);
4098 return err;
4099 }
4100 }
4101
4102 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) 3960 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
4103 err = jbd2_journal_wipe(journal, !really_read_only); 3961 err = jbd2_journal_wipe(journal, !really_read_only);
4104 if (!err) { 3962 if (!err) {
@@ -4385,7 +4243,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4385{ 4243{
4386 struct ext4_super_block *es; 4244 struct ext4_super_block *es;
4387 struct ext4_sb_info *sbi = EXT4_SB(sb); 4245 struct ext4_sb_info *sbi = EXT4_SB(sb);
4388 ext4_fsblk_t n_blocks_count = 0;
4389 unsigned long old_sb_flags; 4246 unsigned long old_sb_flags;
4390 struct ext4_mount_options old_opts; 4247 struct ext4_mount_options old_opts;
4391 int enable_quota = 0; 4248 int enable_quota = 0;
@@ -4418,8 +4275,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4418 /* 4275 /*
4419 * Allow the "check" option to be passed as a remount option. 4276 * Allow the "check" option to be passed as a remount option.
4420 */ 4277 */
4421 if (!parse_options(data, sb, NULL, &journal_ioprio,
4422 &n_blocks_count, 1)) {
4278 if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
4423 err = -EINVAL; 4279 err = -EINVAL;
4424 goto restore_opts; 4280 goto restore_opts;
4425 } 4281 }
@@ -4437,8 +4293,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4437 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 4293 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
4438 } 4294 }
4439 4295
4440 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
4441 n_blocks_count > ext4_blocks_count(es)) {
4296 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
4442 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { 4297 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
4443 err = -EROFS; 4298 err = -EROFS;
4444 goto restore_opts; 4299 goto restore_opts;
@@ -4513,8 +4368,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4513 if (sbi->s_journal) 4368 if (sbi->s_journal)
4514 ext4_clear_journal_err(sb, es); 4369 ext4_clear_journal_err(sb, es);
4515 sbi->s_mount_state = le16_to_cpu(es->s_state); 4370 sbi->s_mount_state = le16_to_cpu(es->s_state);
4516 if ((err = ext4_group_extend(sb, es, n_blocks_count)))
4517 goto restore_opts;
4518 if (!ext4_setup_super(sb, es, 0)) 4371 if (!ext4_setup_super(sb, es, 0))
4519 sb->s_flags &= ~MS_RDONLY; 4372 sb->s_flags &= ~MS_RDONLY;
4520 if (EXT4_HAS_INCOMPAT_FEATURE(sb, 4373 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 93a00d89a220..e88748e55c0f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -82,8 +82,8 @@
82 printk("\n"); \ 82 printk("\n"); \
83 } while (0) 83 } while (0)
84#else 84#else
85# define ea_idebug(f...)
86# define ea_bdebug(f...)
85# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
86# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
87#endif 87#endif
88 88
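Defining the disabled ea_idebug()/ea_bdebug() macros in terms of no_printk()
keeps the arguments compiled, so format strings stay type-checked and the
variables they mention do not become "unused", while still generating no
output. A userspace stand-in for the same idea (this dbg() macro is an
illustration, not the kernel's no_printk()):

#include <stdio.h>

/* The if (0) branch is eliminated by the compiler, but gcc still
 * checks the format string against the arguments. */
#define dbg(fmt, ...)					\
	do {						\
		if (0)					\
			printf(fmt, ##__VA_ARGS__);	\
	} while (0)

int main(void)
{
	int blocks = 42;

	dbg("reading block %d\n", blocks);	/* compiles, prints nothing */
	return 0;
}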
89static void ext4_xattr_cache_insert(struct buffer_head *); 89static void ext4_xattr_cache_insert(struct buffer_head *);
@@ -158,13 +158,10 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
158static inline int 158static inline int
159ext4_xattr_check_block(struct buffer_head *bh) 159ext4_xattr_check_block(struct buffer_head *bh)
160{ 160{
161 int error;
162
163 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 161 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
164 BHDR(bh)->h_blocks != cpu_to_le32(1)) 162 BHDR(bh)->h_blocks != cpu_to_le32(1))
165 return -EIO; 163 return -EIO;
166 error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
167 return error;
164 return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
168} 165}
169 166
170static inline int 167static inline int
@@ -220,7 +217,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
220 error = -ENODATA; 217 error = -ENODATA;
221 if (!EXT4_I(inode)->i_file_acl) 218 if (!EXT4_I(inode)->i_file_acl)
222 goto cleanup; 219 goto cleanup;
223 ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
220 ea_idebug(inode, "reading block %llu",
221 (unsigned long long)EXT4_I(inode)->i_file_acl);
224 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 222 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
225 if (!bh) 223 if (!bh)
226 goto cleanup; 224 goto cleanup;
@@ -363,7 +361,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
363 error = 0; 361 error = 0;
364 if (!EXT4_I(inode)->i_file_acl) 362 if (!EXT4_I(inode)->i_file_acl)
365 goto cleanup; 363 goto cleanup;
366 ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
364 ea_idebug(inode, "reading block %llu",
365 (unsigned long long)EXT4_I(inode)->i_file_acl);
367 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 366 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
368 error = -EIO; 367 error = -EIO;
369 if (!bh) 368 if (!bh)
@@ -487,18 +486,19 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
487 ext4_free_blocks(handle, inode, bh, 0, 1, 486 ext4_free_blocks(handle, inode, bh, 0, 1,
488 EXT4_FREE_BLOCKS_METADATA | 487 EXT4_FREE_BLOCKS_METADATA |
489 EXT4_FREE_BLOCKS_FORGET); 488 EXT4_FREE_BLOCKS_FORGET);
489 unlock_buffer(bh);
490 } else { 490 } else {
491 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 491 le32_add_cpu(&BHDR(bh)->h_refcount, -1);
492 if (ce)
493 mb_cache_entry_release(ce);
494 unlock_buffer(bh);
492 error = ext4_handle_dirty_metadata(handle, inode, bh); 495 error = ext4_handle_dirty_metadata(handle, inode, bh);
493 if (IS_SYNC(inode)) 496 if (IS_SYNC(inode))
494 ext4_handle_sync(handle); 497 ext4_handle_sync(handle);
495 dquot_free_block(inode, 1); 498 dquot_free_block(inode, 1);
496 ea_bdebug(bh, "refcount now=%d; releasing", 499 ea_bdebug(bh, "refcount now=%d; releasing",
497 le32_to_cpu(BHDR(bh)->h_refcount)); 500 le32_to_cpu(BHDR(bh)->h_refcount));
498 if (ce)
499 mb_cache_entry_release(ce);
500 } 501 }
501 unlock_buffer(bh);
502out: 502out:
503 ext4_std_error(inode->i_sb, error); 503 ext4_std_error(inode->i_sb, error);
504 return; 504 return;
@@ -834,7 +834,8 @@ inserted:
834 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 834 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
835 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); 835 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
836 836
837 ea_idebug(inode, "creating block %d", block);
837 ea_idebug(inode, "creating block %llu",
838 (unsigned long long)block);
838 839
839 new_bh = sb_getblk(sb, block); 840 new_bh = sb_getblk(sb, block);
840 if (!new_bh) { 841 if (!new_bh) {
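The ea_idebug() changes in this file all fix the same class of bug: a
possibly 64-bit block number was handed to a "%u" conversion. The portable
pattern, as used in the hunks above, is an explicit cast plus "%llu":

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t block = 1ULL << 33;	/* does not fit in 32 bits */

	/* Wrong: printf("block %u\n", block) reads only part of the
	 * argument (and can misalign the ones that follow).
	 * Right: cast to unsigned long long and use "%llu". */
	printf("block %llu\n", (unsigned long long)block);
	return 0;
}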
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index a81eb2367d39..98ae804f5273 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -521,57 +521,46 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
521 521
522 op = &outname[*outlen * sizeof(wchar_t)]; 522 op = &outname[*outlen * sizeof(wchar_t)];
523 } else { 523 } else {
524 if (nls) {
525 for (i = 0, ip = name, op = outname, *outlen = 0;
526 i < len && *outlen <= FAT_LFN_LEN;
527 *outlen += 1)
528 {
529 if (escape && (*ip == ':')) {
530 if (i > len - 5)
531 return -EINVAL;
532 ec = 0;
533 for (k = 1; k < 5; k++) {
534 nc = ip[k];
535 ec <<= 4;
536 if (nc >= '0' && nc <= '9') {
537 ec |= nc - '0';
538 continue;
539 }
540 if (nc >= 'a' && nc <= 'f') {
541 ec |= nc - ('a' - 10);
542 continue;
543 }
544 if (nc >= 'A' && nc <= 'F') {
545 ec |= nc - ('A' - 10);
546 continue;
547 }
548 return -EINVAL;
549 }
550 *op++ = ec & 0xFF;
551 *op++ = ec >> 8;
552 ip += 5;
553 i += 5;
554 } else {
555 if ((charlen = nls->char2uni(ip, len - i, (wchar_t *)op)) < 0)
556 return -EINVAL;
557 ip += charlen;
558 i += charlen;
559 op += 2;
560 }
561 }
562 if (i < len)
563 return -ENAMETOOLONG;
564 } else {
565 for (i = 0, ip = name, op = outname, *outlen = 0;
566 i < len && *outlen <= FAT_LFN_LEN;
567 i++, *outlen += 1)
568 {
569 *op++ = *ip++;
570 *op++ = 0;
571 }
572 if (i < len)
573 return -ENAMETOOLONG;
574 }
575 }
524 for (i = 0, ip = name, op = outname, *outlen = 0;
525 i < len && *outlen < FAT_LFN_LEN;
526 *outlen += 1) {
527 if (escape && (*ip == ':')) {
528 if (i > len - 5)
529 return -EINVAL;
530 ec = 0;
531 for (k = 1; k < 5; k++) {
532 nc = ip[k];
533 ec <<= 4;
534 if (nc >= '0' && nc <= '9') {
535 ec |= nc - '0';
536 continue;
537 }
538 if (nc >= 'a' && nc <= 'f') {
539 ec |= nc - ('a' - 10);
540 continue;
541 }
542 if (nc >= 'A' && nc <= 'F') {
543 ec |= nc - ('A' - 10);
544 continue;
545 }
546 return -EINVAL;
547 }
548 *op++ = ec & 0xFF;
549 *op++ = ec >> 8;
550 ip += 5;
551 i += 5;
552 } else {
553 charlen = nls->char2uni(ip, len - i,
554 (wchar_t *)op);
555 if (charlen < 0)
556 return -EINVAL;
557 ip += charlen;
558 i += charlen;
559 op += 2;
560 }
561 }
562 if (i < len)
563 return -ENAMETOOLONG;
564 }
576 565
577 *longlen = *outlen; 566 *longlen = *outlen;
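The unified loop that survives above still decodes the ":xxxx" escape: four
hex digits naming a 16-bit code unit, stored into the output low byte first.
A self-contained sketch of just that decoding step (hexval() and unescape()
are names invented for the example):

#include <stdio.h>

/* Decode one hex digit; returns -1 on bad input. Mirrors the chained
 * '0'..'9' / 'a'..'f' / 'A'..'F' tests in xlate_to_uni(). */
static int hexval(unsigned char c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - ('a' - 10);
	if (c >= 'A' && c <= 'F')
		return c - ('A' - 10);
	return -1;
}

/* Parse ":xxxx" at s into a 16-bit code unit; returns -1 on error. */
static int unescape(const unsigned char *s)
{
	int ec = 0, v, k;

	for (k = 1; k < 5; k++) {
		v = hexval(s[k]);
		if (v < 0)
			return -1;
		ec = (ec << 4) | v;
	}
	return ec;
}

int main(void)
{
	int ec = unescape((const unsigned char *)":00e9");

	printf("code unit U+%04X\n", ec);	/* U+00E9 */
	return 0;
}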
diff --git a/fs/file.c b/fs/file.c
index 4c6992d8f3ba..3c426de7203a 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -6,7 +6,7 @@
6 * Manage the dynamic fd arrays in the process files_struct. 6 * Manage the dynamic fd arrays in the process files_struct.
7 */ 7 */
8 8
9#include <linux/module.h>
9#include <linux/export.h>
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/mmzone.h> 12#include <linux/mmzone.h>
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 77b535ac7136..539f36cf3e4a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -14,7 +14,7 @@
14 */ 14 */
15 15
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/module.h>
17#include <linux/export.h>
18#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
@@ -256,7 +256,8 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
256} 256}
257 257
258/* 258/*
259 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
259 * Move expired (dirtied after work->older_than_this) dirty inodes from
260 * @delaying_queue to @dispatch_queue.
260 */ 261 */
261static int move_expired_inodes(struct list_head *delaying_queue, 262static int move_expired_inodes(struct list_head *delaying_queue,
262 struct list_head *dispatch_queue, 263 struct list_head *dispatch_queue,
@@ -1148,23 +1149,6 @@ out_unlock_inode:
1148} 1149}
1149EXPORT_SYMBOL(__mark_inode_dirty); 1150EXPORT_SYMBOL(__mark_inode_dirty);
1150 1151
1151/*
1152 * Write out a superblock's list of dirty inodes. A wait will be performed
1153 * upon no inodes, all inodes or the final one, depending upon sync_mode.
1154 *
1155 * If older_than_this is non-NULL, then only write out inodes which
1156 * had their first dirtying at a time earlier than *older_than_this.
1157 *
1158 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
1159 * This function assumes that the blockdev superblock's inodes are backed by
1160 * a variety of queues, so all inodes are searched. For other superblocks,
1161 * assume that all inodes are backed by the same queue.
1162 *
1163 * The inodes to be written are parked on bdi->b_io. They are moved back onto
1164 * bdi->b_dirty as they are selected for writing. This way, none can be missed
1165 * on the writer throttling path, and we get decent balancing between many
1166 * throttled threads: we don't want them all piling up on inode_sync_wait.
1167 */
1168static void wait_sb_inodes(struct super_block *sb) 1152static void wait_sb_inodes(struct super_block *sb)
1169{ 1153{
1170 struct inode *inode, *old_inode = NULL; 1154 struct inode *inode, *old_inode = NULL;
@@ -1364,8 +1348,6 @@ int write_inode_now(struct inode *inode, int sync)
1364 ret = writeback_single_inode(inode, wb, &wbc); 1348 ret = writeback_single_inode(inode, wb, &wbc);
1365 spin_unlock(&inode->i_lock); 1349 spin_unlock(&inode->i_lock);
1366 spin_unlock(&wb->list_lock); 1350 spin_unlock(&wb->list_lock);
1367 if (sync)
1368 inode_sync_wait(inode);
1369 return ret; 1351 return ret;
1370} 1352}
1371EXPORT_SYMBOL(write_inode_now); 1353EXPORT_SYMBOL(write_inode_now);
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 6324c4274959..e159e682ad4c 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -1,4 +1,4 @@
1#include <linux/module.h>
1#include <linux/export.h>
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/fs.h> 3#include <linux/fs.h>
4#include <linux/path.h> 4#include <linux/path.h>
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 3cbfa93cd782..1fe731337f07 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -67,7 +67,8 @@ extern int access_file(char *path, int r, int w, int x);
67extern int open_file(char *path, int r, int w, int append); 67extern int open_file(char *path, int r, int w, int append);
68extern void *open_dir(char *path, int *err_out); 68extern void *open_dir(char *path, int *err_out);
69extern char *read_dir(void *stream, unsigned long long *pos, 69extern char *read_dir(void *stream, unsigned long long *pos,
70 unsigned long long *ino_out, int *len_out);
70 unsigned long long *ino_out, int *len_out,
71 unsigned int *type_out);
71extern void close_file(void *stream); 72extern void close_file(void *stream);
72extern int replace_file(int oldfd, int fd); 73extern int replace_file(int oldfd, int fd);
73extern void close_dir(void *stream); 74extern void close_dir(void *stream);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 588d45885a6f..07c516bfea76 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -283,6 +283,7 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
283 char *name; 283 char *name;
284 unsigned long long next, ino; 284 unsigned long long next, ino;
285 int error, len; 285 int error, len;
286 unsigned int type;
286 287
287 name = dentry_name(file->f_path.dentry); 288 name = dentry_name(file->f_path.dentry);
288 if (name == NULL) 289 if (name == NULL)
@@ -292,9 +293,9 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
292 if (dir == NULL) 293 if (dir == NULL)
293 return -error; 294 return -error;
294 next = file->f_pos; 295 next = file->f_pos;
295 while ((name = read_dir(dir, &next, &ino, &len)) != NULL) {
296 while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) {
296 error = (*filldir)(ent, name, len, file->f_pos, 297 error = (*filldir)(ent, name, len, file->f_pos,
297 ino, DT_UNKNOWN);
298 ino, type);
298 if (error) break; 299 if (error) break;
299 file->f_pos = next; 300 file->f_pos = next;
300 } 301 }
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index dd7bc38a3825..a74ad0d371c2 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -98,7 +98,8 @@ void *open_dir(char *path, int *err_out)
98} 98}
99 99
100char *read_dir(void *stream, unsigned long long *pos, 100char *read_dir(void *stream, unsigned long long *pos,
101 unsigned long long *ino_out, int *len_out)
101 unsigned long long *ino_out, int *len_out,
102 unsigned int *type_out)
102{ 103{
103 DIR *dir = stream; 104 DIR *dir = stream;
104 struct dirent *ent; 105 struct dirent *ent;
@@ -109,6 +110,7 @@ char *read_dir(void *stream, unsigned long long *pos,
109 return NULL; 110 return NULL;
110 *len_out = strlen(ent->d_name); 111 *len_out = strlen(ent->d_name);
111 *ino_out = ent->d_ino; 112 *ino_out = ent->d_ino;
113 *type_out = ent->d_type;
112 *pos = telldir(dir); 114 *pos = telldir(dir);
113 return ent->d_name; 115 return ent->d_name;
114} 116}
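With read_dir() now returning d_type, hostfs can report real file types to
filldir instead of DT_UNKNOWN. On the host side this is plain readdir();
note that d_type is a BSD/glibc extension and a filesystem may still report
DT_UNKNOWN, so consumers keep a stat() fallback. A minimal illustration:

#define _DEFAULT_SOURCE		/* for d_type and the DT_* constants */
#include <stdio.h>
#include <dirent.h>

int main(void)
{
	DIR *dir = opendir(".");
	struct dirent *ent;

	if (!dir)
		return 1;
	while ((ent = readdir(dir)) != NULL) {
		/* d_type may legitimately be DT_UNKNOWN */
		printf("%-20s type=%u\n", ent->d_name,
		       (unsigned int)ent->d_type);
	}
	closedir(dir);
	return 0;
}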
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 066836e81848..29167bebe874 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -10,7 +10,7 @@
10#include <linux/file.h> 10#include <linux/file.h>
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/security.h> 12#include <linux/security.h>
13#include <linux/module.h>
13#include <linux/export.h>
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/writeback.h> 15#include <linux/writeback.h>
16#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index d49d202903fb..c78841ee81cf 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -88,14 +88,13 @@ static inline void __buffer_relink_io(struct journal_head *jh)
88 * whole transaction. 88 * whole transaction.
89 * 89 *
90 * Requires j_list_lock 90 * Requires j_list_lock
91 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
92 */ 91 */
93static int __try_to_free_cp_buf(struct journal_head *jh) 92static int __try_to_free_cp_buf(struct journal_head *jh)
94{ 93{
95 int ret = 0; 94 int ret = 0;
96 struct buffer_head *bh = jh2bh(jh); 95 struct buffer_head *bh = jh2bh(jh);
97 96
98 if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
97 if (jh->b_transaction == NULL && !buffer_locked(bh) &&
99 !buffer_dirty(bh) && !buffer_write_io_error(bh)) { 98 !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
100 /* 99 /*
101 * Get our reference so that bh cannot be freed before 100 * Get our reference so that bh cannot be freed before
@@ -104,11 +103,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
104 get_bh(bh); 103 get_bh(bh);
105 JBUFFER_TRACE(jh, "remove from checkpoint list"); 104 JBUFFER_TRACE(jh, "remove from checkpoint list");
106 ret = __jbd2_journal_remove_checkpoint(jh) + 1; 105 ret = __jbd2_journal_remove_checkpoint(jh) + 1;
107 jbd_unlock_bh_state(bh);
108 BUFFER_TRACE(bh, "release"); 106 BUFFER_TRACE(bh, "release");
109 __brelse(bh); 107 __brelse(bh);
110 } else {
111 jbd_unlock_bh_state(bh);
112 } 108 }
113 return ret; 109 return ret;
114} 110}
@@ -180,21 +176,6 @@ void __jbd2_log_wait_for_space(journal_t *journal)
180} 176}
181 177
182/* 178/*
183 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
184 * The caller must restart a list walk. Wait for someone else to run
185 * jbd_unlock_bh_state().
186 */
187static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
188 __releases(journal->j_list_lock)
189{
190 get_bh(bh);
191 spin_unlock(&journal->j_list_lock);
192 jbd_lock_bh_state(bh);
193 jbd_unlock_bh_state(bh);
194 put_bh(bh);
195}
196
197/*
198 * Clean up transaction's list of buffers submitted for io. 179 * Clean up transaction's list of buffers submitted for io.
199 * We wait for any pending IO to complete and remove any clean 180 * We wait for any pending IO to complete and remove any clean
200 * buffers. Note that we take the buffers in the opposite ordering 181 * buffers. Note that we take the buffers in the opposite ordering
@@ -222,15 +203,9 @@ restart:
222 while (!released && transaction->t_checkpoint_io_list) { 203 while (!released && transaction->t_checkpoint_io_list) {
223 jh = transaction->t_checkpoint_io_list; 204 jh = transaction->t_checkpoint_io_list;
224 bh = jh2bh(jh); 205 bh = jh2bh(jh);
225 if (!jbd_trylock_bh_state(bh)) {
226 jbd_sync_bh(journal, bh);
227 spin_lock(&journal->j_list_lock);
228 goto restart;
229 }
230 get_bh(bh); 206 get_bh(bh);
231 if (buffer_locked(bh)) { 207 if (buffer_locked(bh)) {
232 spin_unlock(&journal->j_list_lock); 208 spin_unlock(&journal->j_list_lock);
233 jbd_unlock_bh_state(bh);
234 wait_on_buffer(bh); 209 wait_on_buffer(bh);
235 /* the journal_head may have gone by now */ 210 /* the journal_head may have gone by now */
236 BUFFER_TRACE(bh, "brelse"); 211 BUFFER_TRACE(bh, "brelse");
@@ -246,7 +221,6 @@ restart:
246 * it has been written out and so we can drop it from the list 221 * it has been written out and so we can drop it from the list
247 */ 222 */
248 released = __jbd2_journal_remove_checkpoint(jh); 223 released = __jbd2_journal_remove_checkpoint(jh);
249 jbd_unlock_bh_state(bh);
250 __brelse(bh); 224 __brelse(bh);
251 } 225 }
252 226
@@ -266,7 +240,6 @@ __flush_batch(journal_t *journal, int *batch_count)
266 240
267 for (i = 0; i < *batch_count; i++) { 241 for (i = 0; i < *batch_count; i++) {
268 struct buffer_head *bh = journal->j_chkpt_bhs[i]; 242 struct buffer_head *bh = journal->j_chkpt_bhs[i];
269 clear_buffer_jwrite(bh);
270 BUFFER_TRACE(bh, "brelse"); 243 BUFFER_TRACE(bh, "brelse");
271 __brelse(bh); 244 __brelse(bh);
272 } 245 }
@@ -281,7 +254,6 @@ __flush_batch(journal_t *journal, int *batch_count)
281 * be written out. 254 * be written out.
282 * 255 *
283 * Called with j_list_lock held and drops it if 1 is returned 256 * Called with j_list_lock held and drops it if 1 is returned
284 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
285 */ 257 */
286static int __process_buffer(journal_t *journal, struct journal_head *jh, 258static int __process_buffer(journal_t *journal, struct journal_head *jh,
287 int *batch_count, transaction_t *transaction) 259 int *batch_count, transaction_t *transaction)
@@ -292,7 +264,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
292 if (buffer_locked(bh)) { 264 if (buffer_locked(bh)) {
293 get_bh(bh); 265 get_bh(bh);
294 spin_unlock(&journal->j_list_lock); 266 spin_unlock(&journal->j_list_lock);
295 jbd_unlock_bh_state(bh);
296 wait_on_buffer(bh); 267 wait_on_buffer(bh);
297 /* the journal_head may have gone by now */ 268 /* the journal_head may have gone by now */
298 BUFFER_TRACE(bh, "brelse"); 269 BUFFER_TRACE(bh, "brelse");
@@ -304,7 +275,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
304 275
305 transaction->t_chp_stats.cs_forced_to_close++; 276 transaction->t_chp_stats.cs_forced_to_close++;
306 spin_unlock(&journal->j_list_lock); 277 spin_unlock(&journal->j_list_lock);
307 jbd_unlock_bh_state(bh);
308 if (unlikely(journal->j_flags & JBD2_UNMOUNT)) 278 if (unlikely(journal->j_flags & JBD2_UNMOUNT))
309 /* 279 /*
310 * The journal thread is dead; so starting and 280 * The journal thread is dead; so starting and
@@ -323,11 +293,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
323 if (unlikely(buffer_write_io_error(bh))) 293 if (unlikely(buffer_write_io_error(bh)))
324 ret = -EIO; 294 ret = -EIO;
325 get_bh(bh); 295 get_bh(bh);
326 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
327 BUFFER_TRACE(bh, "remove from checkpoint"); 296 BUFFER_TRACE(bh, "remove from checkpoint");
328 __jbd2_journal_remove_checkpoint(jh); 297 __jbd2_journal_remove_checkpoint(jh);
329 spin_unlock(&journal->j_list_lock); 298 spin_unlock(&journal->j_list_lock);
330 jbd_unlock_bh_state(bh);
331 __brelse(bh); 299 __brelse(bh);
332 } else { 300 } else {
333 /* 301 /*
@@ -340,10 +308,8 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
340 BUFFER_TRACE(bh, "queue"); 308 BUFFER_TRACE(bh, "queue");
341 get_bh(bh); 309 get_bh(bh);
342 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 310 J_ASSERT_BH(bh, !buffer_jwrite(bh));
343 set_buffer_jwrite(bh);
344 journal->j_chkpt_bhs[*batch_count] = bh; 311 journal->j_chkpt_bhs[*batch_count] = bh;
345 __buffer_relink_io(jh); 312 __buffer_relink_io(jh);
346 jbd_unlock_bh_state(bh);
347 transaction->t_chp_stats.cs_written++; 313 transaction->t_chp_stats.cs_written++;
348 (*batch_count)++; 314 (*batch_count)++;
349 if (*batch_count == JBD2_NR_BATCH) { 315 if (*batch_count == JBD2_NR_BATCH) {
@@ -407,15 +373,7 @@ restart:
407 int retry = 0, err; 373 int retry = 0, err;
408 374
409 while (!retry && transaction->t_checkpoint_list) { 375 while (!retry && transaction->t_checkpoint_list) {
410 struct buffer_head *bh;
411
412 jh = transaction->t_checkpoint_list; 376 jh = transaction->t_checkpoint_list;
413 bh = jh2bh(jh);
414 if (!jbd_trylock_bh_state(bh)) {
415 jbd_sync_bh(journal, bh);
416 retry = 1;
417 break;
418 }
419 retry = __process_buffer(journal, jh, &batch_count, 377 retry = __process_buffer(journal, jh, &batch_count,
420 transaction); 378 transaction);
421 if (retry < 0 && !result) 379 if (retry < 0 && !result)
@@ -478,79 +436,28 @@ out:
478 436
479int jbd2_cleanup_journal_tail(journal_t *journal) 437int jbd2_cleanup_journal_tail(journal_t *journal)
480{ 438{
481 transaction_t * transaction;
482 tid_t first_tid; 439 tid_t first_tid;
483 unsigned long blocknr, freed;
440 unsigned long blocknr;
484 441
485 if (is_journal_aborted(journal)) 442 if (is_journal_aborted(journal))
486 return 1; 443 return 1;
487 444
445 if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
488 /* OK, work out the oldest transaction remaining in the log, and
489 * the log block it starts at.
490 *
491 * If the log is now empty, we need to work out which is the
492 * next transaction ID we will write, and where it will
493 * start. */
494
495 write_lock(&journal->j_state_lock);
496 spin_lock(&journal->j_list_lock);
497 transaction = journal->j_checkpoint_transactions;
498 if (transaction) {
499 first_tid = transaction->t_tid;
500 blocknr = transaction->t_log_start;
501 } else if ((transaction = journal->j_committing_transaction) != NULL) {
502 first_tid = transaction->t_tid;
503 blocknr = transaction->t_log_start;
504 } else if ((transaction = journal->j_running_transaction) != NULL) {
505 first_tid = transaction->t_tid;
506 blocknr = journal->j_head;
507 } else {
508 first_tid = journal->j_transaction_sequence;
509 blocknr = journal->j_head;
510 }
511 spin_unlock(&journal->j_list_lock);
512 J_ASSERT(blocknr != 0);
513
514 /* If the oldest pinned transaction is at the tail of the log
515 already then there's not much we can do right now. */
516 if (journal->j_tail_sequence == first_tid) {
517 write_unlock(&journal->j_state_lock);
518 return 1; 446 return 1;
519 }
447 J_ASSERT(blocknr != 0);
520
521 /* OK, update the superblock to recover the freed space.
522 * Physical blocks come first: have we wrapped beyond the end of
523 * the log? */
524 freed = blocknr - journal->j_tail;
525 if (blocknr < journal->j_tail)
526 freed = freed + journal->j_last - journal->j_first;
527
528 trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
529 jbd_debug(1,
530 "Cleaning journal tail from %d to %d (offset %lu), "
531 "freeing %lu\n",
532 journal->j_tail_sequence, first_tid, blocknr, freed);
533
534 journal->j_free += freed;
535 journal->j_tail_sequence = first_tid;
536 journal->j_tail = blocknr;
537 write_unlock(&journal->j_state_lock);
538 448
539 /*
540 * If there is an external journal, we need to make sure that
541 * any data blocks that were recently written out --- perhaps
542 * by jbd2_log_do_checkpoint() --- are flushed out before we
543 * drop the transactions from the external journal. It's
544 * unlikely this will be necessary, especially with a
545 * appropriately sized journal, but we need this to guarantee
546 * correctness. Fortunately jbd2_cleanup_journal_tail()
547 * doesn't get called all that often.
548 */
549 if ((journal->j_fs_dev != journal->j_dev) &&
550 (journal->j_flags & JBD2_BARRIER))
551 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
552 if (!(journal->j_flags & JBD2_ABORT))
553 jbd2_journal_update_superblock(journal, 1);
554 return 0;
555}
449 /*
450 * We need to make sure that any blocks that were recently written out
451 * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before
452 * we drop the transactions from the journal. It's unlikely this will
453 * be necessary, especially with an appropriately sized journal, but we
454 * need this to guarantee correctness. Fortunately
455 * jbd2_cleanup_journal_tail() doesn't get called all that often.
456 */
457 if (journal->j_flags & JBD2_BARRIER)
458 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
459
460 __jbd2_update_log_tail(journal, first_tid, blocknr);
461 return 0;
462}
556 463
@@ -582,15 +489,12 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
582 do { 489 do {
583 jh = next_jh; 490 jh = next_jh;
584 next_jh = jh->b_cpnext; 491 next_jh = jh->b_cpnext;
585 /* Use trylock because of the ranking */
586 if (jbd_trylock_bh_state(jh2bh(jh))) {
587 ret = __try_to_free_cp_buf(jh);
588 if (ret) {
589 freed++;
590 if (ret == 2) {
591 *released = 1;
592 return freed;
593 }
594 }
595 }
492 ret = __try_to_free_cp_buf(jh);
493 if (ret) {
494 freed++;
495 if (ret == 2) {
496 *released = 1;
497 return freed;
498 }
499 }
596 /* 500 /*
@@ -673,9 +577,7 @@ out:
673 * The function can free jh and bh. 577 * The function can free jh and bh.
674 * 578 *
675 * This function is called with j_list_lock held. 579 * This function is called with j_list_lock held.
676 * This function is called with jbd_lock_bh_state(jh2bh(jh))
677 */ 580 */
678
679int __jbd2_journal_remove_checkpoint(struct journal_head *jh) 581int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
680{ 582{
681 struct transaction_chp_stats_s *stats; 583 struct transaction_chp_stats_s *stats;
@@ -722,7 +624,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
722 transaction->t_tid, stats); 624 transaction->t_tid, stats);
723 625
724 __jbd2_journal_drop_transaction(journal, transaction); 626 __jbd2_journal_drop_transaction(journal, transaction);
725 kfree(transaction); 627 jbd2_journal_free_transaction(transaction);
726 628
727 /* Just in case anybody was waiting for more transactions to be 629 /* Just in case anybody was waiting for more transactions to be
728 checkpointed... */ 630 checkpointed... */
@@ -797,5 +699,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
797 J_ASSERT(journal->j_committing_transaction != transaction); 699 J_ASSERT(journal->j_committing_transaction != transaction);
798 J_ASSERT(journal->j_running_transaction != transaction); 700 J_ASSERT(journal->j_running_transaction != transaction);
799 701
702 trace_jbd2_drop_transaction(journal, transaction);
703
800 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 704 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
801} 705}
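jbd2_cleanup_journal_tail() now delegates the tail computation, but the
underlying arithmetic is unchanged: the space reclaimed by moving the tail
of a circular log must account for wrap-around. A standalone sketch of that
calculation (freed_blocks() is an invented name mirroring the freed
computation that commit.c performs below):

#include <stdio.h>

/* Blocks reclaimed when the tail of a circular log moves from 'tail'
 * to 'new_tail'; 'first' and 'last' bound the usable block range.
 * Unsigned modular arithmetic makes the wrapped case come out right. */
static unsigned long freed_blocks(unsigned long tail, unsigned long new_tail,
				  unsigned long first, unsigned long last)
{
	unsigned long freed = new_tail - tail;

	if (new_tail < tail)	/* the log wrapped around */
		freed += last - first;
	return freed;
}

int main(void)
{
	/* log occupies blocks [1, 1000); the tail wraps from 900 to 50 */
	printf("freed = %lu\n", freed_blocks(900, 50, 1, 1000));
	return 0;
}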
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 29853deee5ed..806525a7269c 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -330,6 +330,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
330 struct buffer_head *cbh = NULL; /* For transactional checksums */ 330 struct buffer_head *cbh = NULL; /* For transactional checksums */
331 __u32 crc32_sum = ~0; 331 __u32 crc32_sum = ~0;
332 struct blk_plug plug; 332 struct blk_plug plug;
333 /* Tail of the journal */
334 unsigned long first_block;
335 tid_t first_tid;
336 int update_tail;
333 337
334 /* 338 /*
335 * First job: lock down the current transaction and wait for 339 * First job: lock down the current transaction and wait for
@@ -339,7 +343,18 @@ void jbd2_journal_commit_transaction(journal_t *journal)
339 /* Do we need to erase the effects of a prior jbd2_journal_flush? */ 343 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
340 if (journal->j_flags & JBD2_FLUSHED) { 344 if (journal->j_flags & JBD2_FLUSHED) {
341 jbd_debug(3, "super block updated\n"); 345 jbd_debug(3, "super block updated\n");
342 jbd2_journal_update_superblock(journal, 1);
346 mutex_lock(&journal->j_checkpoint_mutex);
347 /*
348 * We hold j_checkpoint_mutex so tail cannot change under us.
349 * We don't need any special data guarantees for writing sb
350 * since journal is empty and it is ok for write to be
351 * flushed only with transaction commit.
352 */
353 jbd2_journal_update_sb_log_tail(journal,
354 journal->j_tail_sequence,
355 journal->j_tail,
356 WRITE_SYNC);
357 mutex_unlock(&journal->j_checkpoint_mutex);
343 } else { 358 } else {
344 jbd_debug(3, "superblock not updated\n"); 359 jbd_debug(3, "superblock not updated\n");
345 } 360 }
@@ -676,10 +691,30 @@ start_journal_io:
676 err = 0; 691 err = 0;
677 } 692 }
678 693
694 /*
695 * Get current oldest transaction in the log before we issue flush
696 * to the filesystem device. After the flush we can be sure that
697 * blocks of all older transactions are checkpointed to persistent
698 * storage and we will be safe to update journal start in the
699 * superblock with the numbers we get here.
700 */
701 update_tail =
702 jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
703
679 write_lock(&journal->j_state_lock); 704 write_lock(&journal->j_state_lock);
705 if (update_tail) {
706 long freed = first_block - journal->j_tail;
707
708 if (first_block < journal->j_tail)
709 freed += journal->j_last - journal->j_first;
710 /* Update tail only if we free significant amount of space */
711 if (freed < journal->j_maxlen / 4)
712 update_tail = 0;
713 }
680 J_ASSERT(commit_transaction->t_state == T_COMMIT); 714 J_ASSERT(commit_transaction->t_state == T_COMMIT);
681 commit_transaction->t_state = T_COMMIT_DFLUSH; 715 commit_transaction->t_state = T_COMMIT_DFLUSH;
682 write_unlock(&journal->j_state_lock); 716 write_unlock(&journal->j_state_lock);
717
683 /* 718 /*
684 * If the journal is not located on the file system device, 719 * If the journal is not located on the file system device,
685 * then we must flush the file system device before we issue 720 * then we must flush the file system device before we issue
@@ -830,6 +865,14 @@ wait_for_iobuf:
830 if (err) 865 if (err)
831 jbd2_journal_abort(journal, err); 866 jbd2_journal_abort(journal, err);
832 867
868 /*
869 * Now disk caches for filesystem device are flushed so we are safe to
870 * erase checkpointed transactions from the log by updating journal
871 * superblock.
872 */
873 if (update_tail)
874 jbd2_update_log_tail(journal, first_tid, first_block);
875
833 /* End of a transaction! Finally, we can do checkpoint 876 /* End of a transaction! Finally, we can do checkpoint
834 processing: any buffers committed as a result of this 877 processing: any buffers committed as a result of this
835 transaction can be removed from any checkpoint list it was on 878 transaction can be removed from any checkpoint list it was on
@@ -1047,7 +1090,7 @@ restart_loop:
1047 jbd_debug(1, "JBD2: commit %d complete, head %d\n", 1090 jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1048 journal->j_commit_sequence, journal->j_tail_sequence); 1091 journal->j_commit_sequence, journal->j_tail_sequence);
1049 if (to_free) 1092 if (to_free)
1050 kfree(commit_transaction); 1093 jbd2_journal_free_transaction(commit_transaction);
1051 1094
1052 wake_up(&journal->j_wait_done_commit); 1095 wake_up(&journal->j_wait_done_commit);
1053} 1096}
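The tail-update logic added above has two subtleties worth isolating: the log is circular, so the new tail block can numerically precede j_tail, and the superblock write is skipped unless it reclaims at least a quarter of the journal. A standalone sketch of that arithmetic (plain types stand in for journal_t fields; the names are illustrative, not from the tree):

	#include <stdbool.h>

	struct log_geometry {
		unsigned long first;	/* j_first: first usable log block */
		unsigned long last;	/* j_last: one past the last block */
		unsigned long tail;	/* j_tail: current start of live log */
		unsigned long maxlen;	/* j_maxlen: total log size */
	};

	/* Mirror of the commit-path check: is moving the tail to new_tail
	 * worth a superblock write, i.e. does it free >= 1/4 of the log? */
	static bool tail_update_worthwhile(const struct log_geometry *g,
					   unsigned long new_tail)
	{
		long freed = new_tail - g->tail;

		/* Circular log: if new_tail wrapped past j_last and came
		 * back around to the front, add the wrapped distance. */
		if (new_tail < g->tail)
			freed += g->last - g->first;
		return freed >= (long)(g->maxlen / 4);
	}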
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c6d22745553f..1afb701622b0 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -70,7 +70,6 @@ EXPORT_SYMBOL(jbd2_journal_revoke);
70 70
71EXPORT_SYMBOL(jbd2_journal_init_dev); 71EXPORT_SYMBOL(jbd2_journal_init_dev);
72EXPORT_SYMBOL(jbd2_journal_init_inode); 72EXPORT_SYMBOL(jbd2_journal_init_inode);
73EXPORT_SYMBOL(jbd2_journal_update_format);
74EXPORT_SYMBOL(jbd2_journal_check_used_features); 73EXPORT_SYMBOL(jbd2_journal_check_used_features);
75EXPORT_SYMBOL(jbd2_journal_check_available_features); 74EXPORT_SYMBOL(jbd2_journal_check_available_features);
76EXPORT_SYMBOL(jbd2_journal_set_features); 75EXPORT_SYMBOL(jbd2_journal_set_features);
@@ -95,7 +94,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
95EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); 94EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
96EXPORT_SYMBOL(jbd2_inode_cache); 95EXPORT_SYMBOL(jbd2_inode_cache);
97 96
98static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
99static void __journal_abort_soft (journal_t *journal, int errno); 97static void __journal_abort_soft (journal_t *journal, int errno);
100static int jbd2_journal_create_slab(size_t slab_size); 98static int jbd2_journal_create_slab(size_t slab_size);
101 99
@@ -745,6 +743,98 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
745 return jbd2_journal_add_journal_head(bh); 743 return jbd2_journal_add_journal_head(bh);
746} 744}
747 745
746/*
747 * Return tid of the oldest transaction in the journal and block in the journal
748 * where the transaction starts.
749 *
 750 * If the journal is now empty, return the ID of the next transaction
 751 * we will write and where that transaction will start.
752 *
 753 * The return value is 0 if the journal tail cannot be pushed any further, 1 if
754 * it can.
755 */
756int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
757 unsigned long *block)
758{
759 transaction_t *transaction;
760 int ret;
761
762 read_lock(&journal->j_state_lock);
763 spin_lock(&journal->j_list_lock);
764 transaction = journal->j_checkpoint_transactions;
765 if (transaction) {
766 *tid = transaction->t_tid;
767 *block = transaction->t_log_start;
768 } else if ((transaction = journal->j_committing_transaction) != NULL) {
769 *tid = transaction->t_tid;
770 *block = transaction->t_log_start;
771 } else if ((transaction = journal->j_running_transaction) != NULL) {
772 *tid = transaction->t_tid;
773 *block = journal->j_head;
774 } else {
775 *tid = journal->j_transaction_sequence;
776 *block = journal->j_head;
777 }
778 ret = tid_gt(*tid, journal->j_tail_sequence);
779 spin_unlock(&journal->j_list_lock);
780 read_unlock(&journal->j_state_lock);
781
782 return ret;
783}
784
785/*
786 * Update information in journal structure and in on disk journal superblock
 787 * about the log tail. This function does not check whether the information
 788 * passed in really pushes the log tail further. It is the caller's
 789 * responsibility to make sure the provided log tail is valid (e.g. by
 790 * holding j_checkpoint_mutex all the time between computing the log tail
 791 * and calling this function, as is the case with jbd2_cleanup_journal_tail()).
792 *
793 * Requires j_checkpoint_mutex
794 */
795void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
796{
797 unsigned long freed;
798
799 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
800
801 /*
 802 * We cannot afford for the write to remain in the drive's caches: as
 803 * soon as we update j_tail, the next transaction can start reusing
 804 * journal space, and if we lost the sb update in a power failure we'd
 805 * replay an old transaction over possibly newly overwritten data.
806 */
807 jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
808 write_lock(&journal->j_state_lock);
809 freed = block - journal->j_tail;
810 if (block < journal->j_tail)
811 freed += journal->j_last - journal->j_first;
812
813 trace_jbd2_update_log_tail(journal, tid, block, freed);
814 jbd_debug(1,
815 "Cleaning journal tail from %d to %d (offset %lu), "
816 "freeing %lu\n",
817 journal->j_tail_sequence, tid, block, freed);
818
819 journal->j_free += freed;
820 journal->j_tail_sequence = tid;
821 journal->j_tail = block;
822 write_unlock(&journal->j_state_lock);
823}
824
825/*
 826 * This is a variation of __jbd2_update_log_tail which checks for validity of
827 * provided log tail and locks j_checkpoint_mutex. So it is safe against races
828 * with other threads updating log tail.
829 */
830void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
831{
832 mutex_lock(&journal->j_checkpoint_mutex);
833 if (tid_gt(tid, journal->j_tail_sequence))
834 __jbd2_update_log_tail(journal, tid, block);
835 mutex_unlock(&journal->j_checkpoint_mutex);
836}
837
748struct jbd2_stats_proc_session { 838struct jbd2_stats_proc_session {
749 journal_t *journal; 839 journal_t *journal;
750 struct transaction_stats_s *stats; 840 struct transaction_stats_s *stats;
@@ -1113,40 +1203,45 @@ static int journal_reset(journal_t *journal)
1113 1203
1114 journal->j_max_transaction_buffers = journal->j_maxlen / 4; 1204 journal->j_max_transaction_buffers = journal->j_maxlen / 4;
1115 1205
1116 /* Add the dynamic fields and write it to disk. */
1117 jbd2_journal_update_superblock(journal, 1);
1118 return jbd2_journal_start_thread(journal);
1119}
1120
1121/**
1122 * void jbd2_journal_update_superblock() - Update journal sb on disk.
1123 * @journal: The journal to update.
1124 * @wait: Set to '0' if you don't want to wait for IO completion.
1125 *
1126 * Update a journal's dynamic superblock fields and write it to disk,
1127 * optionally waiting for the IO to complete.
1128 */
1129void jbd2_journal_update_superblock(journal_t *journal, int wait)
1130{
1131 journal_superblock_t *sb = journal->j_superblock;
1132 struct buffer_head *bh = journal->j_sb_buffer;
1133
1134 /* 1206 /*
1135 * As a special case, if the on-disk copy is already marked as needing 1207 * As a special case, if the on-disk copy is already marked as needing
1136 * no recovery (s_start == 0) and there are no outstanding transactions 1208 * no recovery (s_start == 0), then we can safely defer the superblock
1137 * in the filesystem, then we can safely defer the superblock update 1209 * update until the next commit by setting JBD2_FLUSHED. This avoids
1138 * until the next commit by setting JBD2_FLUSHED. This avoids
 1139 * attempting a write to a potentially read-only device. 1210 * attempting a write to a potentially read-only device.
1140 */ 1211 */
1141 if (sb->s_start == 0 && journal->j_tail_sequence == 1212 if (sb->s_start == 0) {
1142 journal->j_transaction_sequence) {
1143 jbd_debug(1, "JBD2: Skipping superblock update on recovered sb " 1213 jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
1144 "(start %ld, seq %d, errno %d)\n", 1214 "(start %ld, seq %d, errno %d)\n",
1145 journal->j_tail, journal->j_tail_sequence, 1215 journal->j_tail, journal->j_tail_sequence,
1146 journal->j_errno); 1216 journal->j_errno);
1147 goto out; 1217 journal->j_flags |= JBD2_FLUSHED;
1218 } else {
1219 /* Lock here to make assertions happy... */
1220 mutex_lock(&journal->j_checkpoint_mutex);
1221 /*
 1222 * Update log tail information. We use WRITE_FUA since a new
 1223 * transaction will start reusing journal space and so we
 1224 * must make sure the information about the current log tail
 1225 * is on disk before that.
1226 */
1227 jbd2_journal_update_sb_log_tail(journal,
1228 journal->j_tail_sequence,
1229 journal->j_tail,
1230 WRITE_FUA);
1231 mutex_unlock(&journal->j_checkpoint_mutex);
1148 } 1232 }
1233 return jbd2_journal_start_thread(journal);
1234}
1149 1235
1236static void jbd2_write_superblock(journal_t *journal, int write_op)
1237{
1238 struct buffer_head *bh = journal->j_sb_buffer;
1239 int ret;
1240
1241 trace_jbd2_write_superblock(journal, write_op);
1242 if (!(journal->j_flags & JBD2_BARRIER))
1243 write_op &= ~(REQ_FUA | REQ_FLUSH);
1244 lock_buffer(bh);
1150 if (buffer_write_io_error(bh)) { 1245 if (buffer_write_io_error(bh)) {
1151 /* 1246 /*
1152 * Oh, dear. A previous attempt to write the journal 1247 * Oh, dear. A previous attempt to write the journal
@@ -1162,48 +1257,106 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1162 clear_buffer_write_io_error(bh); 1257 clear_buffer_write_io_error(bh);
1163 set_buffer_uptodate(bh); 1258 set_buffer_uptodate(bh);
1164 } 1259 }
1260 get_bh(bh);
1261 bh->b_end_io = end_buffer_write_sync;
1262 ret = submit_bh(write_op, bh);
1263 wait_on_buffer(bh);
1264 if (buffer_write_io_error(bh)) {
1265 clear_buffer_write_io_error(bh);
1266 set_buffer_uptodate(bh);
1267 ret = -EIO;
1268 }
1269 if (ret) {
1270 printk(KERN_ERR "JBD2: Error %d detected when updating "
1271 "journal superblock for %s.\n", ret,
1272 journal->j_devname);
1273 }
1274}
1275
1276/**
1277 * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk.
1278 * @journal: The journal to update.
1279 * @tail_tid: TID of the new transaction at the tail of the log
1280 * @tail_block: The first block of the transaction at the tail of the log
1281 * @write_op: With which operation should we write the journal sb
1282 *
1283 * Update a journal's superblock information about log tail and write it to
1284 * disk, waiting for the IO to complete.
1285 */
1286void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
1287 unsigned long tail_block, int write_op)
1288{
1289 journal_superblock_t *sb = journal->j_superblock;
1290
1291 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1292 jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
1293 tail_block, tail_tid);
1294
1295 sb->s_sequence = cpu_to_be32(tail_tid);
1296 sb->s_start = cpu_to_be32(tail_block);
1297
1298 jbd2_write_superblock(journal, write_op);
1299
1300 /* Log is no longer empty */
1301 write_lock(&journal->j_state_lock);
1302 WARN_ON(!sb->s_sequence);
1303 journal->j_flags &= ~JBD2_FLUSHED;
1304 write_unlock(&journal->j_state_lock);
1305}
1306
1307/**
1308 * jbd2_mark_journal_empty() - Mark on disk journal as empty.
1309 * @journal: The journal to update.
1310 *
 1311 * Update a journal's dynamic superblock fields to show that the journal is empty.
 1312 * Write the updated superblock to disk, waiting for the IO to complete.
1313 */
1314static void jbd2_mark_journal_empty(journal_t *journal)
1315{
1316 journal_superblock_t *sb = journal->j_superblock;
1165 1317
1318 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1166 read_lock(&journal->j_state_lock); 1319 read_lock(&journal->j_state_lock);
1167 jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n", 1320 jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
1168 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1321 journal->j_tail_sequence);
1169 1322
1170 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 1323 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
1171 sb->s_start = cpu_to_be32(journal->j_tail); 1324 sb->s_start = cpu_to_be32(0);
1172 sb->s_errno = cpu_to_be32(journal->j_errno);
1173 read_unlock(&journal->j_state_lock); 1325 read_unlock(&journal->j_state_lock);
1174 1326
1175 BUFFER_TRACE(bh, "marking dirty"); 1327 jbd2_write_superblock(journal, WRITE_FUA);
1176 mark_buffer_dirty(bh);
1177 if (wait) {
1178 sync_dirty_buffer(bh);
1179 if (buffer_write_io_error(bh)) {
1180 printk(KERN_ERR "JBD2: I/O error detected "
1181 "when updating journal superblock for %s.\n",
1182 journal->j_devname);
1183 clear_buffer_write_io_error(bh);
1184 set_buffer_uptodate(bh);
1185 }
1186 } else
1187 write_dirty_buffer(bh, WRITE);
1188
1189out:
1190 /* If we have just flushed the log (by marking s_start==0), then
1191 * any future commit will have to be careful to update the
1192 * superblock again to re-record the true start of the log. */
1193 1328
1329 /* Log is no longer empty */
1194 write_lock(&journal->j_state_lock); 1330 write_lock(&journal->j_state_lock);
1195 if (sb->s_start) 1331 journal->j_flags |= JBD2_FLUSHED;
1196 journal->j_flags &= ~JBD2_FLUSHED;
1197 else
1198 journal->j_flags |= JBD2_FLUSHED;
1199 write_unlock(&journal->j_state_lock); 1332 write_unlock(&journal->j_state_lock);
1200} 1333}
1201 1334
1335
1336/**
1337 * jbd2_journal_update_sb_errno() - Update error in the journal.
1338 * @journal: The journal to update.
1339 *
 1340 * Update a journal's errno. Write the updated superblock to disk, waiting for the IO
1341 * to complete.
1342 */
1343static void jbd2_journal_update_sb_errno(journal_t *journal)
1344{
1345 journal_superblock_t *sb = journal->j_superblock;
1346
1347 read_lock(&journal->j_state_lock);
1348 jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
1349 journal->j_errno);
1350 sb->s_errno = cpu_to_be32(journal->j_errno);
1351 read_unlock(&journal->j_state_lock);
1352
1353 jbd2_write_superblock(journal, WRITE_SYNC);
1354}
1355
1202/* 1356/*
1203 * Read the superblock for a given journal, performing initial 1357 * Read the superblock for a given journal, performing initial
1204 * validation of the format. 1358 * validation of the format.
1205 */ 1359 */
1206
1207static int journal_get_superblock(journal_t *journal) 1360static int journal_get_superblock(journal_t *journal)
1208{ 1361{
1209 struct buffer_head *bh; 1362 struct buffer_head *bh;
@@ -1397,14 +1550,11 @@ int jbd2_journal_destroy(journal_t *journal)
1397 1550
1398 if (journal->j_sb_buffer) { 1551 if (journal->j_sb_buffer) {
1399 if (!is_journal_aborted(journal)) { 1552 if (!is_journal_aborted(journal)) {
1400 /* We can now mark the journal as empty. */ 1553 mutex_lock(&journal->j_checkpoint_mutex);
1401 journal->j_tail = 0; 1554 jbd2_mark_journal_empty(journal);
1402 journal->j_tail_sequence = 1555 mutex_unlock(&journal->j_checkpoint_mutex);
1403 ++journal->j_transaction_sequence; 1556 } else
1404 jbd2_journal_update_superblock(journal, 1);
1405 } else {
1406 err = -EIO; 1557 err = -EIO;
1407 }
1408 brelse(journal->j_sb_buffer); 1558 brelse(journal->j_sb_buffer);
1409 } 1559 }
1410 1560
@@ -1551,61 +1701,6 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
1551EXPORT_SYMBOL(jbd2_journal_clear_features); 1701EXPORT_SYMBOL(jbd2_journal_clear_features);
1552 1702
1553/** 1703/**
1554 * int jbd2_journal_update_format () - Update on-disk journal structure.
1555 * @journal: Journal to act on.
1556 *
1557 * Given an initialised but unloaded journal struct, poke about in the
1558 * on-disk structure to update it to the most recent supported version.
1559 */
1560int jbd2_journal_update_format (journal_t *journal)
1561{
1562 journal_superblock_t *sb;
1563 int err;
1564
1565 err = journal_get_superblock(journal);
1566 if (err)
1567 return err;
1568
1569 sb = journal->j_superblock;
1570
1571 switch (be32_to_cpu(sb->s_header.h_blocktype)) {
1572 case JBD2_SUPERBLOCK_V2:
1573 return 0;
1574 case JBD2_SUPERBLOCK_V1:
1575 return journal_convert_superblock_v1(journal, sb);
1576 default:
1577 break;
1578 }
1579 return -EINVAL;
1580}
1581
1582static int journal_convert_superblock_v1(journal_t *journal,
1583 journal_superblock_t *sb)
1584{
1585 int offset, blocksize;
1586 struct buffer_head *bh;
1587
1588 printk(KERN_WARNING
1589 "JBD2: Converting superblock from version 1 to 2.\n");
1590
1591 /* Pre-initialise new fields to zero */
1592 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
1593 blocksize = be32_to_cpu(sb->s_blocksize);
1594 memset(&sb->s_feature_compat, 0, blocksize-offset);
1595
1596 sb->s_nr_users = cpu_to_be32(1);
1597 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
1598 journal->j_format_version = 2;
1599
1600 bh = journal->j_sb_buffer;
1601 BUFFER_TRACE(bh, "marking dirty");
1602 mark_buffer_dirty(bh);
1603 sync_dirty_buffer(bh);
1604 return 0;
1605}
1606
1607
1608/**
1609 * int jbd2_journal_flush () - Flush journal 1704 * int jbd2_journal_flush () - Flush journal
1610 * @journal: Journal to act on. 1705 * @journal: Journal to act on.
1611 * 1706 *
@@ -1618,7 +1713,6 @@ int jbd2_journal_flush(journal_t *journal)
1618{ 1713{
1619 int err = 0; 1714 int err = 0;
1620 transaction_t *transaction = NULL; 1715 transaction_t *transaction = NULL;
1621 unsigned long old_tail;
1622 1716
1623 write_lock(&journal->j_state_lock); 1717 write_lock(&journal->j_state_lock);
1624 1718
@@ -1653,6 +1747,7 @@ int jbd2_journal_flush(journal_t *journal)
1653 if (is_journal_aborted(journal)) 1747 if (is_journal_aborted(journal))
1654 return -EIO; 1748 return -EIO;
1655 1749
1750 mutex_lock(&journal->j_checkpoint_mutex);
1656 jbd2_cleanup_journal_tail(journal); 1751 jbd2_cleanup_journal_tail(journal);
1657 1752
1658 /* Finally, mark the journal as really needing no recovery. 1753 /* Finally, mark the journal as really needing no recovery.
@@ -1660,14 +1755,9 @@ int jbd2_journal_flush(journal_t *journal)
1660 * the magic code for a fully-recovered superblock. Any future 1755 * the magic code for a fully-recovered superblock. Any future
1661 * commits of data to the journal will restore the current 1756 * commits of data to the journal will restore the current
1662 * s_start value. */ 1757 * s_start value. */
1758 jbd2_mark_journal_empty(journal);
1759 mutex_unlock(&journal->j_checkpoint_mutex);
1663 write_lock(&journal->j_state_lock); 1760 write_lock(&journal->j_state_lock);
1664 old_tail = journal->j_tail;
1665 journal->j_tail = 0;
1666 write_unlock(&journal->j_state_lock);
1667 jbd2_journal_update_superblock(journal, 1);
1668 write_lock(&journal->j_state_lock);
1669 journal->j_tail = old_tail;
1670
1671 J_ASSERT(!journal->j_running_transaction); 1761 J_ASSERT(!journal->j_running_transaction);
1672 J_ASSERT(!journal->j_committing_transaction); 1762 J_ASSERT(!journal->j_committing_transaction);
1673 J_ASSERT(!journal->j_checkpoint_transactions); 1763 J_ASSERT(!journal->j_checkpoint_transactions);
@@ -1707,8 +1797,12 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1707 write ? "Clearing" : "Ignoring"); 1797 write ? "Clearing" : "Ignoring");
1708 1798
1709 err = jbd2_journal_skip_recovery(journal); 1799 err = jbd2_journal_skip_recovery(journal);
1710 if (write) 1800 if (write) {
1711 jbd2_journal_update_superblock(journal, 1); 1801 /* Lock to make assertions happy... */
1802 mutex_lock(&journal->j_checkpoint_mutex);
1803 jbd2_mark_journal_empty(journal);
1804 mutex_unlock(&journal->j_checkpoint_mutex);
1805 }
1712 1806
1713 no_recovery: 1807 no_recovery:
1714 return err; 1808 return err;
@@ -1758,7 +1852,7 @@ static void __journal_abort_soft (journal_t *journal, int errno)
1758 __jbd2_journal_abort_hard(journal); 1852 __jbd2_journal_abort_hard(journal);
1759 1853
1760 if (errno) 1854 if (errno)
1761 jbd2_journal_update_superblock(journal, 1); 1855 jbd2_journal_update_sb_errno(journal);
1762} 1856}
1763 1857
1764/** 1858/**
@@ -2016,7 +2110,7 @@ static struct kmem_cache *jbd2_journal_head_cache;
2016static atomic_t nr_journal_heads = ATOMIC_INIT(0); 2110static atomic_t nr_journal_heads = ATOMIC_INIT(0);
2017#endif 2111#endif
2018 2112
2019static int journal_init_jbd2_journal_head_cache(void) 2113static int jbd2_journal_init_journal_head_cache(void)
2020{ 2114{
2021 int retval; 2115 int retval;
2022 2116
@@ -2034,7 +2128,7 @@ static int journal_init_jbd2_journal_head_cache(void)
2034 return retval; 2128 return retval;
2035} 2129}
2036 2130
2037static void jbd2_journal_destroy_jbd2_journal_head_cache(void) 2131static void jbd2_journal_destroy_journal_head_cache(void)
2038{ 2132{
2039 if (jbd2_journal_head_cache) { 2133 if (jbd2_journal_head_cache) {
2040 kmem_cache_destroy(jbd2_journal_head_cache); 2134 kmem_cache_destroy(jbd2_journal_head_cache);
@@ -2322,7 +2416,7 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void)
2322 2416
2323struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; 2417struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
2324 2418
2325static int __init journal_init_handle_cache(void) 2419static int __init jbd2_journal_init_handle_cache(void)
2326{ 2420{
2327 jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); 2421 jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
2328 if (jbd2_handle_cache == NULL) { 2422 if (jbd2_handle_cache == NULL) {
@@ -2357,17 +2451,20 @@ static int __init journal_init_caches(void)
2357 2451
2358 ret = jbd2_journal_init_revoke_caches(); 2452 ret = jbd2_journal_init_revoke_caches();
2359 if (ret == 0) 2453 if (ret == 0)
2360 ret = journal_init_jbd2_journal_head_cache(); 2454 ret = jbd2_journal_init_journal_head_cache();
2455 if (ret == 0)
2456 ret = jbd2_journal_init_handle_cache();
2361 if (ret == 0) 2457 if (ret == 0)
2362 ret = journal_init_handle_cache(); 2458 ret = jbd2_journal_init_transaction_cache();
2363 return ret; 2459 return ret;
2364} 2460}
2365 2461
2366static void jbd2_journal_destroy_caches(void) 2462static void jbd2_journal_destroy_caches(void)
2367{ 2463{
2368 jbd2_journal_destroy_revoke_caches(); 2464 jbd2_journal_destroy_revoke_caches();
2369 jbd2_journal_destroy_jbd2_journal_head_cache(); 2465 jbd2_journal_destroy_journal_head_cache();
2370 jbd2_journal_destroy_handle_cache(); 2466 jbd2_journal_destroy_handle_cache();
2467 jbd2_journal_destroy_transaction_cache();
2371 jbd2_journal_destroy_slabs(); 2468 jbd2_journal_destroy_slabs();
2372} 2469}
2373 2470
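jbd2_journal_get_log_tail() above decides whether the tail can move at all with tid_gt(). Transaction IDs are 32-bit counters that eventually wrap, so the ordering test is a signed difference rather than a plain comparison; a minimal sketch of the idiom (the in-tree helper lives in include/linux/jbd2.h and is equivalent in spirit):

	#include <stdio.h>

	typedef unsigned int tid_t;

	/* Wrap-safe "x is newer than y": correct whenever the two tids are
	 * within 2^31 of each other, which the journal guarantees. */
	static int tid_gt(tid_t x, tid_t y)
	{
		int difference = (int)(x - y);
		return difference > 0;
	}

	int main(void)
	{
		printf("%d\n", tid_gt(2u, 0xfffffffeu)); /* 1: 2 is newer across the wrap */
		printf("%d\n", tid_gt(0xfffffffeu, 2u)); /* 0 */
		return 0;
	}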
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index da6d7baf1390..c1a03354a22f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -21,6 +21,7 @@
21#include <linux/jbd2.h> 21#include <linux/jbd2.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/crc32.h> 23#include <linux/crc32.h>
24#include <linux/blkdev.h>
24#endif 25#endif
25 26
26/* 27/*
@@ -265,7 +266,9 @@ int jbd2_journal_recover(journal_t *journal)
265 err2 = sync_blockdev(journal->j_fs_dev); 266 err2 = sync_blockdev(journal->j_fs_dev);
266 if (!err) 267 if (!err)
267 err = err2; 268 err = err2;
268 269 /* Make sure all replayed data is on permanent storage */
270 if (journal->j_flags & JBD2_BARRIER)
271 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
269 return err; 272 return err;
270} 273}
271 274
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 30b2867d6cc9..6973705d6a3d 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -208,17 +208,13 @@ int __init jbd2_journal_init_revoke_caches(void)
208 J_ASSERT(!jbd2_revoke_record_cache); 208 J_ASSERT(!jbd2_revoke_record_cache);
209 J_ASSERT(!jbd2_revoke_table_cache); 209 J_ASSERT(!jbd2_revoke_table_cache);
210 210
211 jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", 211 jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s,
212 sizeof(struct jbd2_revoke_record_s), 212 SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY);
213 0,
214 SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
215 NULL);
216 if (!jbd2_revoke_record_cache) 213 if (!jbd2_revoke_record_cache)
217 goto record_cache_failure; 214 goto record_cache_failure;
218 215
219 jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", 216 jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s,
220 sizeof(struct jbd2_revoke_table_s), 217 SLAB_TEMPORARY);
221 0, SLAB_TEMPORARY, NULL);
222 if (!jbd2_revoke_table_cache) 218 if (!jbd2_revoke_table_cache)
223 goto table_cache_failure; 219 goto table_cache_failure;
224 return 0; 220 return 0;
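The revoke-cache hunk is a pure conversion to the KMEM_CACHE() convenience macro, which derives the slab's name, size and alignment from the struct itself. Roughly (paraphrasing include/linux/slab.h of this era), the first call now expands to:

	#define KMEM_CACHE(__struct, __flags)				\
		kmem_cache_create(#__struct, sizeof(struct __struct),	\
				  __alignof__(struct __struct), (__flags), NULL)

	jbd2_revoke_record_cache =
		kmem_cache_create("jbd2_revoke_record_s",
				  sizeof(struct jbd2_revoke_record_s),
				  __alignof__(struct jbd2_revoke_record_s),
				  SLAB_HWCACHE_ALIGN | SLAB_TEMPORARY, NULL);

Besides brevity, the conversion changes the visible slab name to the struct's name and passes the struct's natural alignment instead of 0.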
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e5aba56e1fd5..ddcd3549c6c2 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -33,6 +33,35 @@
33static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 33static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
34static void __jbd2_journal_unfile_buffer(struct journal_head *jh); 34static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
35 35
36static struct kmem_cache *transaction_cache;
37int __init jbd2_journal_init_transaction_cache(void)
38{
39 J_ASSERT(!transaction_cache);
40 transaction_cache = kmem_cache_create("jbd2_transaction_s",
41 sizeof(transaction_t),
42 0,
43 SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
44 NULL);
45 if (transaction_cache)
46 return 0;
47 return -ENOMEM;
48}
49
50void jbd2_journal_destroy_transaction_cache(void)
51{
52 if (transaction_cache) {
53 kmem_cache_destroy(transaction_cache);
54 transaction_cache = NULL;
55 }
56}
57
58void jbd2_journal_free_transaction(transaction_t *transaction)
59{
60 if (unlikely(ZERO_OR_NULL_PTR(transaction)))
61 return;
62 kmem_cache_free(transaction_cache, transaction);
63}
64
36/* 65/*
37 * jbd2_get_transaction: obtain a new transaction_t object. 66 * jbd2_get_transaction: obtain a new transaction_t object.
38 * 67 *
@@ -133,7 +162,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
133 162
134alloc_transaction: 163alloc_transaction:
135 if (!journal->j_running_transaction) { 164 if (!journal->j_running_transaction) {
136 new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask); 165 new_transaction = kmem_cache_alloc(transaction_cache,
166 gfp_mask | __GFP_ZERO);
137 if (!new_transaction) { 167 if (!new_transaction) {
138 /* 168 /*
139 * If __GFP_FS is not present, then we may be 169 * If __GFP_FS is not present, then we may be
@@ -162,7 +192,7 @@ repeat:
162 if (is_journal_aborted(journal) || 192 if (is_journal_aborted(journal) ||
163 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { 193 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
164 read_unlock(&journal->j_state_lock); 194 read_unlock(&journal->j_state_lock);
165 kfree(new_transaction); 195 jbd2_journal_free_transaction(new_transaction);
166 return -EROFS; 196 return -EROFS;
167 } 197 }
168 198
@@ -284,7 +314,7 @@ repeat:
284 read_unlock(&journal->j_state_lock); 314 read_unlock(&journal->j_state_lock);
285 315
286 lock_map_acquire(&handle->h_lockdep_map); 316 lock_map_acquire(&handle->h_lockdep_map);
287 kfree(new_transaction); 317 jbd2_journal_free_transaction(new_transaction);
288 return 0; 318 return 0;
289} 319}
290 320
@@ -1549,9 +1579,9 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1549 * of these pointers, it could go bad. Generally the caller needs to re-read 1579 * of these pointers, it could go bad. Generally the caller needs to re-read
1550 * the pointer from the transaction_t. 1580 * the pointer from the transaction_t.
1551 * 1581 *
1552 * Called under j_list_lock. The journal may not be locked. 1582 * Called under j_list_lock.
1553 */ 1583 */
1554void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) 1584static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1555{ 1585{
1556 struct journal_head **list = NULL; 1586 struct journal_head **list = NULL;
1557 transaction_t *transaction; 1587 transaction_t *transaction;
@@ -1646,10 +1676,8 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1646 spin_lock(&journal->j_list_lock); 1676 spin_lock(&journal->j_list_lock);
1647 if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { 1677 if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1648 /* written-back checkpointed metadata buffer */ 1678 /* written-back checkpointed metadata buffer */
1649 if (jh->b_jlist == BJ_None) { 1679 JBUFFER_TRACE(jh, "remove from checkpoint list");
1650 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1680 __jbd2_journal_remove_checkpoint(jh);
1651 __jbd2_journal_remove_checkpoint(jh);
1652 }
1653 } 1681 }
1654 spin_unlock(&journal->j_list_lock); 1682 spin_unlock(&journal->j_list_lock);
1655out: 1683out:
@@ -1949,6 +1977,8 @@ zap_buffer_unlocked:
1949 clear_buffer_mapped(bh); 1977 clear_buffer_mapped(bh);
1950 clear_buffer_req(bh); 1978 clear_buffer_req(bh);
1951 clear_buffer_new(bh); 1979 clear_buffer_new(bh);
1980 clear_buffer_delay(bh);
1981 clear_buffer_unwritten(bh);
1952 bh->b_bdev = NULL; 1982 bh->b_bdev = NULL;
1953 return may_free; 1983 return may_free;
1954} 1984}
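The allocation-side change in start_this_handle() is mechanical: a dedicated slab replaces the generic heap, with __GFP_ZERO preserving kzalloc()'s zeroing and the new free helper preserving kfree()'s tolerance of NULL. Side by side (a sketch of the call-site shape, not additional patched code):

	/* before: generic allocator, zeroed; kfree(NULL) is a no-op */
	new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask);
	kfree(new_transaction);

	/* after: __GFP_ZERO keeps the zeroed-allocation semantics, and
	 * jbd2_journal_free_transaction() bails out on NULL (via
	 * ZERO_OR_NULL_PTR) so call sites need no extra checks */
	new_transaction = kmem_cache_alloc(transaction_cache,
					   gfp_mask | __GFP_ZERO);
	jbd2_journal_free_transaction(new_transaction);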
diff --git a/fs/libfs.c b/fs/libfs.c
index 722e0d5ba182..4a0d1f06da57 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -3,7 +3,7 @@
 3 * Library for filesystem writers. 3 * Library for filesystem writers.
4 */ 4 */
5 5
6#include <linux/module.h> 6#include <linux/export.h>
7#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index f848b52c67b1..3ddcbb1c0a43 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -598,7 +598,7 @@ static struct rpc_procinfo nlm4_procedures[] = {
598 PROC(GRANTED_RES, res, norep), 598 PROC(GRANTED_RES, res, norep),
599}; 599};
600 600
601struct rpc_version nlm_version4 = { 601const struct rpc_version nlm_version4 = {
602 .number = 4, 602 .number = 4,
603 .nrprocs = ARRAY_SIZE(nlm4_procedures), 603 .nrprocs = ARRAY_SIZE(nlm4_procedures),
604 .procs = nlm4_procedures, 604 .procs = nlm4_procedures,
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 8d4ea8351e3d..ba1dc2eebd1e 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -62,7 +62,8 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
62 62
63 host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, 63 host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
64 nlm_init->protocol, nlm_version, 64 nlm_init->protocol, nlm_version,
65 nlm_init->hostname, nlm_init->noresvport); 65 nlm_init->hostname, nlm_init->noresvport,
66 nlm_init->net);
66 if (host == NULL) { 67 if (host == NULL) {
67 lockd_down(); 68 lockd_down();
68 return ERR_PTR(-ENOLCK); 69 return ERR_PTR(-ENOLCK);
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 180ac34feb9a..3d35e3e80c1c 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -596,19 +596,19 @@ static struct rpc_procinfo nlm_procedures[] = {
596 PROC(GRANTED_RES, res, norep), 596 PROC(GRANTED_RES, res, norep),
597}; 597};
598 598
599static struct rpc_version nlm_version1 = { 599static const struct rpc_version nlm_version1 = {
600 .number = 1, 600 .number = 1,
601 .nrprocs = ARRAY_SIZE(nlm_procedures), 601 .nrprocs = ARRAY_SIZE(nlm_procedures),
602 .procs = nlm_procedures, 602 .procs = nlm_procedures,
603}; 603};
604 604
605static struct rpc_version nlm_version3 = { 605static const struct rpc_version nlm_version3 = {
606 .number = 3, 606 .number = 3,
607 .nrprocs = ARRAY_SIZE(nlm_procedures), 607 .nrprocs = ARRAY_SIZE(nlm_procedures),
608 .procs = nlm_procedures, 608 .procs = nlm_procedures,
609}; 609};
610 610
611static struct rpc_version *nlm_versions[] = { 611static const struct rpc_version *nlm_versions[] = {
612 [1] = &nlm_version1, 612 [1] = &nlm_version1,
613 [3] = &nlm_version3, 613 [3] = &nlm_version3,
614#ifdef CONFIG_LOCKD_V4 614#ifdef CONFIG_LOCKD_V4
@@ -618,7 +618,7 @@ static struct rpc_version *nlm_versions[] = {
618 618
619static struct rpc_stat nlm_rpc_stats; 619static struct rpc_stat nlm_rpc_stats;
620 620
621struct rpc_program nlm_program = { 621const struct rpc_program nlm_program = {
622 .name = "lockd", 622 .name = "lockd",
623 .number = NLM_PROGRAM, 623 .number = NLM_PROGRAM,
624 .nrvers = ARRAY_SIZE(nlm_versions), 624 .nrvers = ARRAY_SIZE(nlm_versions),
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 6f29836ec0cb..eb75ca7c2d6e 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -17,6 +17,8 @@
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18#include <linux/mutex.h> 18#include <linux/mutex.h>
19 19
20#include <linux/sunrpc/svc_xprt.h>
21
20#include <net/ipv6.h> 22#include <net/ipv6.h>
21 23
22#define NLMDBG_FACILITY NLMDBG_HOSTCACHE 24#define NLMDBG_FACILITY NLMDBG_HOSTCACHE
@@ -54,6 +56,7 @@ struct nlm_lookup_host_info {
54 const char *hostname; /* remote's hostname */ 56 const char *hostname; /* remote's hostname */
55 const size_t hostname_len; /* it's length */ 57 const size_t hostname_len; /* it's length */
56 const int noresvport; /* use non-priv port */ 58 const int noresvport; /* use non-priv port */
59 struct net *net; /* network namespace to bind */
57}; 60};
58 61
59/* 62/*
@@ -155,6 +158,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
155 INIT_LIST_HEAD(&host->h_reclaim); 158 INIT_LIST_HEAD(&host->h_reclaim);
156 host->h_nsmhandle = nsm; 159 host->h_nsmhandle = nsm;
157 host->h_addrbuf = nsm->sm_addrbuf; 160 host->h_addrbuf = nsm->sm_addrbuf;
161 host->net = ni->net;
158 162
159out: 163out:
160 return host; 164 return host;
@@ -206,7 +210,8 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
206 const unsigned short protocol, 210 const unsigned short protocol,
207 const u32 version, 211 const u32 version,
208 const char *hostname, 212 const char *hostname,
209 int noresvport) 213 int noresvport,
214 struct net *net)
210{ 215{
211 struct nlm_lookup_host_info ni = { 216 struct nlm_lookup_host_info ni = {
212 .server = 0, 217 .server = 0,
@@ -217,6 +222,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
217 .hostname = hostname, 222 .hostname = hostname,
218 .hostname_len = strlen(hostname), 223 .hostname_len = strlen(hostname),
219 .noresvport = noresvport, 224 .noresvport = noresvport,
225 .net = net,
220 }; 226 };
221 struct hlist_head *chain; 227 struct hlist_head *chain;
222 struct hlist_node *pos; 228 struct hlist_node *pos;
@@ -231,6 +237,8 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
231 237
232 chain = &nlm_client_hosts[nlm_hash_address(sap)]; 238 chain = &nlm_client_hosts[nlm_hash_address(sap)];
233 hlist_for_each_entry(host, pos, chain, h_hash) { 239 hlist_for_each_entry(host, pos, chain, h_hash) {
240 if (host->net != net)
241 continue;
234 if (!rpc_cmp_addr(nlm_addr(host), sap)) 242 if (!rpc_cmp_addr(nlm_addr(host), sap))
235 continue; 243 continue;
236 244
@@ -318,6 +326,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
318 struct nsm_handle *nsm = NULL; 326 struct nsm_handle *nsm = NULL;
319 struct sockaddr *src_sap = svc_daddr(rqstp); 327 struct sockaddr *src_sap = svc_daddr(rqstp);
320 size_t src_len = rqstp->rq_daddrlen; 328 size_t src_len = rqstp->rq_daddrlen;
329 struct net *net = rqstp->rq_xprt->xpt_net;
321 struct nlm_lookup_host_info ni = { 330 struct nlm_lookup_host_info ni = {
322 .server = 1, 331 .server = 1,
323 .sap = svc_addr(rqstp), 332 .sap = svc_addr(rqstp),
@@ -326,6 +335,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
326 .version = rqstp->rq_vers, 335 .version = rqstp->rq_vers,
327 .hostname = hostname, 336 .hostname = hostname,
328 .hostname_len = hostname_len, 337 .hostname_len = hostname_len,
338 .net = net,
329 }; 339 };
330 340
331 dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, 341 dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
@@ -339,6 +349,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
339 349
340 chain = &nlm_server_hosts[nlm_hash_address(ni.sap)]; 350 chain = &nlm_server_hosts[nlm_hash_address(ni.sap)];
341 hlist_for_each_entry(host, pos, chain, h_hash) { 351 hlist_for_each_entry(host, pos, chain, h_hash) {
352 if (host->net != net)
353 continue;
342 if (!rpc_cmp_addr(nlm_addr(host), ni.sap)) 354 if (!rpc_cmp_addr(nlm_addr(host), ni.sap))
343 continue; 355 continue;
344 356
@@ -431,7 +443,7 @@ nlm_bind_host(struct nlm_host *host)
431 .to_retries = 5U, 443 .to_retries = 5U,
432 }; 444 };
433 struct rpc_create_args args = { 445 struct rpc_create_args args = {
434 .net = &init_net, 446 .net = host->net,
435 .protocol = host->h_proto, 447 .protocol = host->h_proto,
436 .address = nlm_addr(host), 448 .address = nlm_addr(host),
437 .addrsize = host->h_addrlen, 449 .addrsize = host->h_addrlen,
@@ -553,12 +565,8 @@ void nlm_host_rebooted(const struct nlm_reboot *info)
553 nsm_release(nsm); 565 nsm_release(nsm);
554} 566}
555 567
556/*
557 * Shut down the hosts module.
558 * Note that this routine is called only at server shutdown time.
559 */
560void 568void
561nlm_shutdown_hosts(void) 569nlm_shutdown_hosts_net(struct net *net)
562{ 570{
563 struct hlist_head *chain; 571 struct hlist_head *chain;
564 struct hlist_node *pos; 572 struct hlist_node *pos;
@@ -570,6 +578,8 @@ nlm_shutdown_hosts(void)
570 /* First, make all hosts eligible for gc */ 578 /* First, make all hosts eligible for gc */
571 dprintk("lockd: nuking all hosts...\n"); 579 dprintk("lockd: nuking all hosts...\n");
572 for_each_host(host, pos, chain, nlm_server_hosts) { 580 for_each_host(host, pos, chain, nlm_server_hosts) {
581 if (net && host->net != net)
582 continue;
573 host->h_expires = jiffies - 1; 583 host->h_expires = jiffies - 1;
574 if (host->h_rpcclnt) { 584 if (host->h_rpcclnt) {
575 rpc_shutdown_client(host->h_rpcclnt); 585 rpc_shutdown_client(host->h_rpcclnt);
@@ -580,15 +590,29 @@ nlm_shutdown_hosts(void)
580 /* Then, perform a garbage collection pass */ 590 /* Then, perform a garbage collection pass */
581 nlm_gc_hosts(); 591 nlm_gc_hosts();
582 mutex_unlock(&nlm_host_mutex); 592 mutex_unlock(&nlm_host_mutex);
593}
594
595/*
596 * Shut down the hosts module.
597 * Note that this routine is called only at server shutdown time.
598 */
599void
600nlm_shutdown_hosts(void)
601{
602 struct hlist_head *chain;
603 struct hlist_node *pos;
604 struct nlm_host *host;
605
606 nlm_shutdown_hosts_net(NULL);
583 607
584 /* complain if any hosts are left */ 608 /* complain if any hosts are left */
585 if (nrhosts != 0) { 609 if (nrhosts != 0) {
586 printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); 610 printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");
587 dprintk("lockd: %lu hosts left:\n", nrhosts); 611 dprintk("lockd: %lu hosts left:\n", nrhosts);
588 for_each_host(host, pos, chain, nlm_server_hosts) { 612 for_each_host(host, pos, chain, nlm_server_hosts) {
589 dprintk(" %s (cnt %d use %d exp %ld)\n", 613 dprintk(" %s (cnt %d use %d exp %ld net %p)\n",
590 host->h_name, atomic_read(&host->h_count), 614 host->h_name, atomic_read(&host->h_count),
591 host->h_inuse, host->h_expires); 615 host->h_inuse, host->h_expires, host->net);
592 } 616 }
593 } 617 }
594} 618}
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 65ba36b80a9e..7ef14b3c5bee 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -47,7 +47,7 @@ struct nsm_res {
47 u32 state; 47 u32 state;
48}; 48};
49 49
50static struct rpc_program nsm_program; 50static const struct rpc_program nsm_program;
51static LIST_HEAD(nsm_handles); 51static LIST_HEAD(nsm_handles);
52static DEFINE_SPINLOCK(nsm_lock); 52static DEFINE_SPINLOCK(nsm_lock);
53 53
@@ -62,14 +62,14 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
62 return (struct sockaddr *)&nsm->sm_addr; 62 return (struct sockaddr *)&nsm->sm_addr;
63} 63}
64 64
65static struct rpc_clnt *nsm_create(void) 65static struct rpc_clnt *nsm_create(struct net *net)
66{ 66{
67 struct sockaddr_in sin = { 67 struct sockaddr_in sin = {
68 .sin_family = AF_INET, 68 .sin_family = AF_INET,
69 .sin_addr.s_addr = htonl(INADDR_LOOPBACK), 69 .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
70 }; 70 };
71 struct rpc_create_args args = { 71 struct rpc_create_args args = {
72 .net = &init_net, 72 .net = net,
73 .protocol = XPRT_TRANSPORT_UDP, 73 .protocol = XPRT_TRANSPORT_UDP,
74 .address = (struct sockaddr *)&sin, 74 .address = (struct sockaddr *)&sin,
75 .addrsize = sizeof(sin), 75 .addrsize = sizeof(sin),
@@ -83,7 +83,8 @@ static struct rpc_clnt *nsm_create(void)
83 return rpc_create(&args); 83 return rpc_create(&args);
84} 84}
85 85
86static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) 86static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
87 struct net *net)
87{ 88{
88 struct rpc_clnt *clnt; 89 struct rpc_clnt *clnt;
89 int status; 90 int status;
@@ -99,7 +100,7 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
99 .rpc_resp = res, 100 .rpc_resp = res,
100 }; 101 };
101 102
102 clnt = nsm_create(); 103 clnt = nsm_create(net);
103 if (IS_ERR(clnt)) { 104 if (IS_ERR(clnt)) {
104 status = PTR_ERR(clnt); 105 status = PTR_ERR(clnt);
105 dprintk("lockd: failed to create NSM upcall transport, " 106 dprintk("lockd: failed to create NSM upcall transport, "
@@ -149,7 +150,7 @@ int nsm_monitor(const struct nlm_host *host)
149 */ 150 */
150 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; 151 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
151 152
152 status = nsm_mon_unmon(nsm, NSMPROC_MON, &res); 153 status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host->net);
153 if (unlikely(res.status != 0)) 154 if (unlikely(res.status != 0))
154 status = -EIO; 155 status = -EIO;
155 if (unlikely(status < 0)) { 156 if (unlikely(status < 0)) {
@@ -183,7 +184,7 @@ void nsm_unmonitor(const struct nlm_host *host)
183 && nsm->sm_monitored && !nsm->sm_sticky) { 184 && nsm->sm_monitored && !nsm->sm_sticky) {
184 dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); 185 dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
185 186
186 status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res); 187 status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host->net);
187 if (res.status != 0) 188 if (res.status != 0)
188 status = -EIO; 189 status = -EIO;
189 if (status < 0) 190 if (status < 0)
@@ -534,19 +535,19 @@ static struct rpc_procinfo nsm_procedures[] = {
534 }, 535 },
535}; 536};
536 537
537static struct rpc_version nsm_version1 = { 538static const struct rpc_version nsm_version1 = {
538 .number = 1, 539 .number = 1,
539 .nrprocs = ARRAY_SIZE(nsm_procedures), 540 .nrprocs = ARRAY_SIZE(nsm_procedures),
540 .procs = nsm_procedures 541 .procs = nsm_procedures
541}; 542};
542 543
543static struct rpc_version * nsm_version[] = { 544static const struct rpc_version *nsm_version[] = {
544 [1] = &nsm_version1, 545 [1] = &nsm_version1,
545}; 546};
546 547
547static struct rpc_stat nsm_stats; 548static struct rpc_stat nsm_stats;
548 549
549static struct rpc_program nsm_program = { 550static const struct rpc_program nsm_program = {
550 .name = "statd", 551 .name = "statd",
551 .number = NSM_PROGRAM, 552 .number = NSM_PROGRAM,
552 .nrvers = ARRAY_SIZE(nsm_version), 553 .nrvers = ARRAY_SIZE(nsm_version),
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
new file mode 100644
index 000000000000..ce227e0fbc5c
--- /dev/null
+++ b/fs/lockd/netns.h
@@ -0,0 +1,12 @@
1#ifndef __LOCKD_NETNS_H__
2#define __LOCKD_NETNS_H__
3
4#include <net/netns/generic.h>
5
6struct lockd_net {
7 unsigned int nlmsvc_users;
8};
9
10extern int lockd_net_id;
11
12#endif
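lockd's new per-namespace state is reached through the pernet machinery: register_pernet_subsys() allocates .size bytes for every network namespace, and net_generic() returns that slot. A minimal sketch of the access pattern, assuming lockd_net_ops has been registered as done in svc.c below (the helper name is illustrative):

	#include <net/net_namespace.h>
	#include <net/netns/generic.h>

	/* Fetch this namespace's lockd state; the slot was sized from
	 * lockd_net_ops.size and zeroed at namespace creation. */
	static unsigned int lockd_users_in(struct net *net)
	{
		struct lockd_net *ln = net_generic(net, lockd_net_id);

		return ln->nlmsvc_users;
	}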
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index c061b9aa7ddb..2774e1013b34 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -35,6 +35,8 @@
35#include <linux/lockd/lockd.h> 35#include <linux/lockd/lockd.h>
36#include <linux/nfs.h> 36#include <linux/nfs.h>
37 37
38#include "netns.h"
39
38#define NLMDBG_FACILITY NLMDBG_SVC 40#define NLMDBG_FACILITY NLMDBG_SVC
39#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) 41#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE)
40#define ALLOWED_SIGS (sigmask(SIGKILL)) 42#define ALLOWED_SIGS (sigmask(SIGKILL))
@@ -50,6 +52,8 @@ static struct task_struct *nlmsvc_task;
50static struct svc_rqst *nlmsvc_rqst; 52static struct svc_rqst *nlmsvc_rqst;
51unsigned long nlmsvc_timeout; 53unsigned long nlmsvc_timeout;
52 54
55int lockd_net_id;
56
53/* 57/*
54 * These can be set at insmod time (useful for NFS as root filesystem), 58 * These can be set at insmod time (useful for NFS as root filesystem),
55 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 59 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003
@@ -189,27 +193,29 @@ lockd(void *vrqstp)
189} 193}
190 194
191static int create_lockd_listener(struct svc_serv *serv, const char *name, 195static int create_lockd_listener(struct svc_serv *serv, const char *name,
192 const int family, const unsigned short port) 196 struct net *net, const int family,
197 const unsigned short port)
193{ 198{
194 struct svc_xprt *xprt; 199 struct svc_xprt *xprt;
195 200
196 xprt = svc_find_xprt(serv, name, family, 0); 201 xprt = svc_find_xprt(serv, name, net, family, 0);
197 if (xprt == NULL) 202 if (xprt == NULL)
198 return svc_create_xprt(serv, name, &init_net, family, port, 203 return svc_create_xprt(serv, name, net, family, port,
199 SVC_SOCK_DEFAULTS); 204 SVC_SOCK_DEFAULTS);
200 svc_xprt_put(xprt); 205 svc_xprt_put(xprt);
201 return 0; 206 return 0;
202} 207}
203 208
204static int create_lockd_family(struct svc_serv *serv, const int family) 209static int create_lockd_family(struct svc_serv *serv, struct net *net,
210 const int family)
205{ 211{
206 int err; 212 int err;
207 213
208 err = create_lockd_listener(serv, "udp", family, nlm_udpport); 214 err = create_lockd_listener(serv, "udp", net, family, nlm_udpport);
209 if (err < 0) 215 if (err < 0)
210 return err; 216 return err;
211 217
212 return create_lockd_listener(serv, "tcp", family, nlm_tcpport); 218 return create_lockd_listener(serv, "tcp", net, family, nlm_tcpport);
213} 219}
214 220
215/* 221/*
@@ -222,16 +228,16 @@ static int create_lockd_family(struct svc_serv *serv, const int family)
222 * Returns zero if all listeners are available; otherwise a 228 * Returns zero if all listeners are available; otherwise a
223 * negative errno value is returned. 229 * negative errno value is returned.
224 */ 230 */
225static int make_socks(struct svc_serv *serv) 231static int make_socks(struct svc_serv *serv, struct net *net)
226{ 232{
227 static int warned; 233 static int warned;
228 int err; 234 int err;
229 235
230 err = create_lockd_family(serv, PF_INET); 236 err = create_lockd_family(serv, net, PF_INET);
231 if (err < 0) 237 if (err < 0)
232 goto out_err; 238 goto out_err;
233 239
234 err = create_lockd_family(serv, PF_INET6); 240 err = create_lockd_family(serv, net, PF_INET6);
235 if (err < 0 && err != -EAFNOSUPPORT) 241 if (err < 0 && err != -EAFNOSUPPORT)
236 goto out_err; 242 goto out_err;
237 243
@@ -245,6 +251,47 @@ out_err:
245 return err; 251 return err;
246} 252}
247 253
254static int lockd_up_net(struct net *net)
255{
256 struct lockd_net *ln = net_generic(net, lockd_net_id);
257 struct svc_serv *serv = nlmsvc_rqst->rq_server;
258 int error;
259
260 if (ln->nlmsvc_users)
261 return 0;
262
263 error = svc_rpcb_setup(serv, net);
264 if (error)
265 goto err_rpcb;
266
267 error = make_socks(serv, net);
268 if (error < 0)
269 goto err_socks;
270 return 0;
271
272err_socks:
273 svc_rpcb_cleanup(serv, net);
274err_rpcb:
275 return error;
276}
277
278static void lockd_down_net(struct net *net)
279{
280 struct lockd_net *ln = net_generic(net, lockd_net_id);
281 struct svc_serv *serv = nlmsvc_rqst->rq_server;
282
283 if (ln->nlmsvc_users) {
284 if (--ln->nlmsvc_users == 0) {
285 nlm_shutdown_hosts_net(net);
286 svc_shutdown_net(serv, net);
287 }
288 } else {
289 printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n",
290 nlmsvc_task, net);
291 BUG();
292 }
293}
294
248/* 295/*
249 * Bring up the lockd process if it's not already up. 296 * Bring up the lockd process if it's not already up.
250 */ 297 */
@@ -252,13 +299,16 @@ int lockd_up(void)
252{ 299{
253 struct svc_serv *serv; 300 struct svc_serv *serv;
254 int error = 0; 301 int error = 0;
302 struct net *net = current->nsproxy->net_ns;
255 303
256 mutex_lock(&nlmsvc_mutex); 304 mutex_lock(&nlmsvc_mutex);
257 /* 305 /*
258 * Check whether we're already up and running. 306 * Check whether we're already up and running.
259 */ 307 */
260 if (nlmsvc_rqst) 308 if (nlmsvc_rqst) {
309 error = lockd_up_net(net);
261 goto out; 310 goto out;
311 }
262 312
263 /* 313 /*
264 * Sanity check: if there's no pid, 314 * Sanity check: if there's no pid,
@@ -275,7 +325,7 @@ int lockd_up(void)
275 goto out; 325 goto out;
276 } 326 }
277 327
278 error = make_socks(serv); 328 error = make_socks(serv, net);
279 if (error < 0) 329 if (error < 0)
280 goto destroy_and_out; 330 goto destroy_and_out;
281 331
@@ -313,8 +363,12 @@ int lockd_up(void)
313destroy_and_out: 363destroy_and_out:
314 svc_destroy(serv); 364 svc_destroy(serv);
315out: 365out:
316 if (!error) 366 if (!error) {
367 struct lockd_net *ln = net_generic(net, lockd_net_id);
368
369 ln->nlmsvc_users++;
317 nlmsvc_users++; 370 nlmsvc_users++;
371 }
318 mutex_unlock(&nlmsvc_mutex); 372 mutex_unlock(&nlmsvc_mutex);
319 return error; 373 return error;
320} 374}
@@ -328,8 +382,10 @@ lockd_down(void)
328{ 382{
329 mutex_lock(&nlmsvc_mutex); 383 mutex_lock(&nlmsvc_mutex);
330 if (nlmsvc_users) { 384 if (nlmsvc_users) {
331 if (--nlmsvc_users) 385 if (--nlmsvc_users) {
386 lockd_down_net(current->nsproxy->net_ns);
332 goto out; 387 goto out;
388 }
333 } else { 389 } else {
334 printk(KERN_ERR "lockd_down: no users! task=%p\n", 390 printk(KERN_ERR "lockd_down: no users! task=%p\n",
335 nlmsvc_task); 391 nlmsvc_task);
@@ -497,24 +553,55 @@ module_param_call(nlm_tcpport, param_set_port, param_get_int,
497module_param(nsm_use_hostnames, bool, 0644); 553module_param(nsm_use_hostnames, bool, 0644);
498module_param(nlm_max_connections, uint, 0644); 554module_param(nlm_max_connections, uint, 0644);
499 555
556static int lockd_init_net(struct net *net)
557{
558 return 0;
559}
560
561static void lockd_exit_net(struct net *net)
562{
563}
564
565static struct pernet_operations lockd_net_ops = {
566 .init = lockd_init_net,
567 .exit = lockd_exit_net,
568 .id = &lockd_net_id,
569 .size = sizeof(struct lockd_net),
570};
571
572
500/* 573/*
501 * Initialising and terminating the module. 574 * Initialising and terminating the module.
502 */ 575 */
503 576
504static int __init init_nlm(void) 577static int __init init_nlm(void)
505{ 578{
579 int err;
580
506#ifdef CONFIG_SYSCTL 581#ifdef CONFIG_SYSCTL
582 err = -ENOMEM;
507 nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root); 583 nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root);
508 return nlm_sysctl_table ? 0 : -ENOMEM; 584 if (nlm_sysctl_table == NULL)
509#else 585 goto err_sysctl;
586#endif
587 err = register_pernet_subsys(&lockd_net_ops);
588 if (err)
589 goto err_pernet;
510 return 0; 590 return 0;
591
592err_pernet:
593#ifdef CONFIG_SYSCTL
594 unregister_sysctl_table(nlm_sysctl_table);
511#endif 595#endif
596err_sysctl:
597 return err;
512} 598}
513 599
514static void __exit exit_nlm(void) 600static void __exit exit_nlm(void)
515{ 601{
516 /* FIXME: delete all NLM clients */ 602 /* FIXME: delete all NLM clients */
517 nlm_shutdown_hosts(); 603 nlm_shutdown_hosts();
604 unregister_pernet_subsys(&lockd_net_ops);
518#ifdef CONFIG_SYSCTL 605#ifdef CONFIG_SYSCTL
519 unregister_sysctl_table(nlm_sysctl_table); 606 unregister_sysctl_table(nlm_sysctl_table);
520#endif 607#endif
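After these hunks lockd tracks two user counts: the global nlmsvc_users (does the kernel thread exist at all?) and the per-namespace ln->nlmsvc_users (does this namespace hold listeners?). A hedged sketch of the resulting caller-visible behaviour, with error handling elided:

	int err;

	/* Process in netns A, first user anywhere: starts the lockd
	 * thread and creates A's sockets. */
	err = lockd_up();

	/* Process in netns B, thread already running: only
	 * lockd_up_net() runs, adding B's sockets and rpcbind state. */
	err = lockd_up();

	/* Last user in B: lockd_down_net() tears down B's hosts and
	 * sockets while the shared thread keeps serving A. */
	lockd_down();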
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index f0179c3745d2..e46353f41a42 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -46,7 +46,6 @@ static void nlmsvc_remove_block(struct nlm_block *block);
46static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); 46static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);
47static void nlmsvc_freegrantargs(struct nlm_rqst *call); 47static void nlmsvc_freegrantargs(struct nlm_rqst *call);
48static const struct rpc_call_ops nlmsvc_grant_ops; 48static const struct rpc_call_ops nlmsvc_grant_ops;
49static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie);
50 49
51/* 50/*
52 * The list of blocked locks to retry 51 * The list of blocked locks to retry
@@ -54,6 +53,35 @@ static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie);
54static LIST_HEAD(nlm_blocked); 53static LIST_HEAD(nlm_blocked);
55static DEFINE_SPINLOCK(nlm_blocked_lock); 54static DEFINE_SPINLOCK(nlm_blocked_lock);
56 55
56#ifdef LOCKD_DEBUG
57static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
58{
59 /*
60 * We can get away with a static buffer because we're only
61 * called with BKL held.
62 */
63 static char buf[2*NLM_MAXCOOKIELEN+1];
64 unsigned int i, len = sizeof(buf);
65 char *p = buf;
66
67 len--; /* allow for trailing \0 */
68 if (len < 3)
69 return "???";
70 for (i = 0 ; i < cookie->len ; i++) {
71 if (len < 2) {
72 strcpy(p-3, "...");
73 break;
74 }
75 sprintf(p, "%02x", cookie->data[i]);
76 p += 2;
77 len -= 2;
78 }
79 *p = '\0';
80
81 return buf;
82}
83#endif
84
57/* 85/*
58 * Insert a blocked lock into the global list 86 * Insert a blocked lock into the global list
59 */ 87 */
@@ -935,32 +963,3 @@ nlmsvc_retry_blocked(void)
935 963
936 return timeout; 964 return timeout;
937} 965}
938
939#ifdef RPC_DEBUG
940static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
941{
942 /*
943 * We can get away with a static buffer because we're only
944 * called with BKL held.
945 */
946 static char buf[2*NLM_MAXCOOKIELEN+1];
947 unsigned int i, len = sizeof(buf);
948 char *p = buf;
949
950 len--; /* allow for trailing \0 */
951 if (len < 3)
952 return "???";
953 for (i = 0 ; i < cookie->len ; i++) {
954 if (len < 2) {
955 strcpy(p-3, "...");
956 break;
957 }
958 sprintf(p, "%02x", cookie->data[i]);
959 p += 2;
960 len -= 2;
961 }
962 *p = '\0';
963
964 return buf;
965}
966#endif
diff --git a/fs/mpage.c b/fs/mpage.c
index 643e9f55ef29..0face1c4d4c6 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -13,7 +13,7 @@
13 */ 13 */
14 14
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/module.h> 16#include <linux/export.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/kdev_t.h> 18#include <linux/kdev_t.h>
19#include <linux/gfp.h> 19#include <linux/gfp.h>
diff --git a/fs/namei.c b/fs/namei.c
index 73ec863a9896..e615ff37e27d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -15,7 +15,7 @@
15 */ 15 */
16 16
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/module.h> 18#include <linux/export.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/namei.h> 21#include <linux/namei.h>
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index dbcd82126aed..2a0e6c599147 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -64,6 +64,7 @@ config NFS_V4
64 bool "NFS client support for NFS version 4" 64 bool "NFS client support for NFS version 4"
65 depends on NFS_FS 65 depends on NFS_FS
66 select SUNRPC_GSS 66 select SUNRPC_GSS
67 select KEYS
67 help 68 help
68 This option enables support for version 4 of the NFS protocol 69 This option enables support for version 4 of the NFS protocol
69 (RFC 3530) in the kernel's NFS client. 70 (RFC 3530) in the kernel's NFS client.
@@ -98,6 +99,18 @@ config PNFS_OBJLAYOUT
98 depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD 99 depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
99 default m 100 default m
100 101
102config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
103 string "NFSv4.1 Implementation ID Domain"
104 depends on NFS_V4_1
105 default "kernel.org"
106 help
107 This option defines the domain portion of the implementation ID that
108 may be sent in the NFS exchange_id operation. The value must be in
109 the format of a DNS domain name and should be set to the DNS domain
110 name of the distribution.
111 If the NFS client is unchanged from the upstream kernel, this
112 option should be set to the default "kernel.org".
113
101config ROOT_NFS 114config ROOT_NFS
102 bool "Root file system on NFS" 115 bool "Root file system on NFS"
103 depends on NFS_FS=y && IP_PNP 116 depends on NFS_FS=y && IP_PNP
@@ -130,16 +143,10 @@ config NFS_USE_KERNEL_DNS
130 bool 143 bool
131 depends on NFS_V4 && !NFS_USE_LEGACY_DNS 144 depends on NFS_V4 && !NFS_USE_LEGACY_DNS
132 select DNS_RESOLVER 145 select DNS_RESOLVER
133 select KEYS
134 default y 146 default y
135 147
136config NFS_USE_NEW_IDMAPPER 148config NFS_DEBUG
137 bool "Use the new idmapper upcall routine" 149 bool
138 depends on NFS_V4 && KEYS 150 depends on NFS_FS && SUNRPC_DEBUG
139 help 151 select CRC32
140 Say Y here if you want NFS to use the new idmapper upcall functions. 152 default y
141 You will need /sbin/request-key (usually provided by the keyutils
142 package). For details, read
143 <file:Documentation/filesystems/nfs/idmapper.txt>.
144
145 If you are unsure, say N.
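A Kconfig string option surfaces in C as a quoted-string macro in the generated autoconf.h, so the new domain is directly usable when building the NFSv4.1 implementation ID. A hypothetical use (the real exchange_id plumbing lives in nfs4proc.c hunks beyond this excerpt; the helper is illustrative only):

	#include <linux/kernel.h>
	#include <linux/utsname.h>

	/* CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN expands to the string
	 * configured above, e.g. "kernel.org". */
	static void fill_impl_name(char *buf, size_t len)
	{
		snprintf(buf, len, "%s %s %s",
			 CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN,
			 utsname()->sysname, utsname()->release);
	}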
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 48cfac31f64c..9c94297bb70e 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -46,9 +46,6 @@ MODULE_LICENSE("GPL");
46MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); 46MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
47MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); 47MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
48 48
49struct dentry *bl_device_pipe;
50wait_queue_head_t bl_wq;
51
52static void print_page(struct page *page) 49static void print_page(struct page *page)
53{ 50{
54 dprintk("PRINTPAGE page %p\n", page); 51 dprintk("PRINTPAGE page %p\n", page);
@@ -236,12 +233,11 @@ bl_read_pagelist(struct nfs_read_data *rdata)
236 sector_t isect, extent_length = 0; 233 sector_t isect, extent_length = 0;
237 struct parallel_io *par; 234 struct parallel_io *par;
238 loff_t f_offset = rdata->args.offset; 235 loff_t f_offset = rdata->args.offset;
239 size_t count = rdata->args.count;
240 struct page **pages = rdata->args.pages; 236 struct page **pages = rdata->args.pages;
241 int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; 237 int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
242 238
243 dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__, 239 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
244 rdata->npages, f_offset, count); 240 rdata->npages, f_offset, (unsigned int)rdata->args.count);
245 241
246 par = alloc_parallel(rdata); 242 par = alloc_parallel(rdata);
247 if (!par) 243 if (!par)
@@ -1025,10 +1021,128 @@ static const struct rpc_pipe_ops bl_upcall_ops = {
1025 .destroy_msg = bl_pipe_destroy_msg, 1021 .destroy_msg = bl_pipe_destroy_msg,
1026}; 1022};
1027 1023
1024static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
1025 struct rpc_pipe *pipe)
1026{
1027 struct dentry *dir, *dentry;
1028
1029 dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
1030 if (dir == NULL)
1031 return ERR_PTR(-ENOENT);
1032 dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
1033 dput(dir);
1034 return dentry;
1035}
1036
1037static void nfs4blocklayout_unregister_sb(struct super_block *sb,
1038 struct rpc_pipe *pipe)
1039{
1040 if (pipe->dentry)
1041 rpc_unlink(pipe->dentry);
1042}
1043
1044static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
1045 void *ptr)
1046{
1047 struct super_block *sb = ptr;
1048 struct net *net = sb->s_fs_info;
1049 struct nfs_net *nn = net_generic(net, nfs_net_id);
1050 struct dentry *dentry;
1051 int ret = 0;
1052
1053 if (!try_module_get(THIS_MODULE))
1054 return 0;
1055
1056 if (nn->bl_device_pipe == NULL) {
1057 module_put(THIS_MODULE);
1058 return 0;
1059 }
1060
1061 switch (event) {
1062 case RPC_PIPEFS_MOUNT:
1063 dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
1064 if (IS_ERR(dentry)) {
1065 ret = PTR_ERR(dentry);
1066 break;
1067 }
1068 nn->bl_device_pipe->dentry = dentry;
1069 break;
1070 case RPC_PIPEFS_UMOUNT:
1071 if (nn->bl_device_pipe->dentry)
1072 nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
1073 break;
1074 default:
1075 ret = -ENOTSUPP;
1076 break;
1077 }
1078 module_put(THIS_MODULE);
1079 return ret;
1080}
1081
1082static struct notifier_block nfs4blocklayout_block = {
1083 .notifier_call = rpc_pipefs_event,
1084};
1085
1086static struct dentry *nfs4blocklayout_register_net(struct net *net,
1087 struct rpc_pipe *pipe)
1088{
1089 struct super_block *pipefs_sb;
1090 struct dentry *dentry;
1091
1092 pipefs_sb = rpc_get_sb_net(net);
1093 if (!pipefs_sb)
1094 return NULL;
1095 dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
1096 rpc_put_sb_net(net);
1097 return dentry;
1098}
1099
1100static void nfs4blocklayout_unregister_net(struct net *net,
1101 struct rpc_pipe *pipe)
1102{
1103 struct super_block *pipefs_sb;
1104
1105 pipefs_sb = rpc_get_sb_net(net);
1106 if (pipefs_sb) {
1107 nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
1108 rpc_put_sb_net(net);
1109 }
1110}
1111
1112static int nfs4blocklayout_net_init(struct net *net)
1113{
1114 struct nfs_net *nn = net_generic(net, nfs_net_id);
1115 struct dentry *dentry;
1116
1117 init_waitqueue_head(&nn->bl_wq);
1118 nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
1119 if (IS_ERR(nn->bl_device_pipe))
1120 return PTR_ERR(nn->bl_device_pipe);
1121 dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
1122 if (IS_ERR(dentry)) {
1123 rpc_destroy_pipe_data(nn->bl_device_pipe);
1124 return PTR_ERR(dentry);
1125 }
1126 nn->bl_device_pipe->dentry = dentry;
1127 return 0;
1128}
1129
1130static void nfs4blocklayout_net_exit(struct net *net)
1131{
1132 struct nfs_net *nn = net_generic(net, nfs_net_id);
1133
1134 nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
1135 rpc_destroy_pipe_data(nn->bl_device_pipe);
1136 nn->bl_device_pipe = NULL;
1137}
1138
1139static struct pernet_operations nfs4blocklayout_net_ops = {
1140 .init = nfs4blocklayout_net_init,
1141 .exit = nfs4blocklayout_net_exit,
1142};
1143
1028static int __init nfs4blocklayout_init(void) 1144static int __init nfs4blocklayout_init(void)
1029{ 1145{
1030 struct vfsmount *mnt;
1031 struct path path;
1032 int ret; 1146 int ret;
1033 1147
1034 dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); 1148 dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
@@ -1037,32 +1151,17 @@ static int __init nfs4blocklayout_init(void)
1037 if (ret) 1151 if (ret)
1038 goto out; 1152 goto out;
1039 1153
1040 init_waitqueue_head(&bl_wq); 1154 ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
1041 1155 if (ret)
1042 mnt = rpc_get_mount();
1043 if (IS_ERR(mnt)) {
1044 ret = PTR_ERR(mnt);
1045 goto out_remove; 1156 goto out_remove;
1046 } 1157 ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
1047
1048 ret = vfs_path_lookup(mnt->mnt_root,
1049 mnt,
1050 NFS_PIPE_DIRNAME, 0, &path);
1051 if (ret) 1158 if (ret)
1052 goto out_putrpc; 1159 goto out_notifier;
1053
1054 bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
1055 &bl_upcall_ops, 0);
1056 path_put(&path);
1057 if (IS_ERR(bl_device_pipe)) {
1058 ret = PTR_ERR(bl_device_pipe);
1059 goto out_putrpc;
1060 }
1061out: 1160out:
1062 return ret; 1161 return ret;
1063 1162
1064out_putrpc: 1163out_notifier:
1065 rpc_put_mount(); 1164 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
1066out_remove: 1165out_remove:
1067 pnfs_unregister_layoutdriver(&blocklayout_type); 1166 pnfs_unregister_layoutdriver(&blocklayout_type);
1068 return ret; 1167 return ret;
@@ -1073,9 +1172,9 @@ static void __exit nfs4blocklayout_exit(void)
1073 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", 1172 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
1074 __func__); 1173 __func__);
1075 1174
1175 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
1176 unregister_pernet_subsys(&nfs4blocklayout_net_ops);
1076 pnfs_unregister_layoutdriver(&blocklayout_type); 1177 pnfs_unregister_layoutdriver(&blocklayout_type);
1077 rpc_unlink(bl_device_pipe);
1078 rpc_put_mount();
1079} 1178}
1080 1179
1081MODULE_ALIAS("nfs-layouttype4-3"); 1180MODULE_ALIAS("nfs-layouttype4-3");
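
The init/exit changes reduce to a fixed shape for per-net rpc_pipefs users: a pernet subsystem owns the pipe data for each namespace, while a pipefs notifier attaches or detaches the dentry as rpc_pipefs superblocks come and go. Sketched on its own (layoutdriver registration omitted):

	static int __init example_init(void)
	{
		int ret;

		/* Register the notifier before the pernet ops: the pernet
		 * ->init creates nn->bl_device_pipe, and any pipefs mount
		 * happening afterwards must already be observable. */
		ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
		if (ret)
			return ret;
		ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
		if (ret)
			rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
		return ret;
	}

	static void __exit example_exit(void)
	{
		/* Teardown in reverse: stop reacting to pipefs events
		 * first, then destroy the per-net pipe data. */
		rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
		unregister_pernet_subsys(&nfs4blocklayout_net_ops);
	}
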
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index e31a2df28e70..03350690118e 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -37,6 +37,7 @@
37#include <linux/sunrpc/rpc_pipe_fs.h> 37#include <linux/sunrpc/rpc_pipe_fs.h>
38 38
39#include "../pnfs.h" 39#include "../pnfs.h"
40#include "../netns.h"
40 41
41#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT) 42#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
42#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) 43#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
@@ -50,6 +51,7 @@ struct pnfs_block_dev {
50 struct list_head bm_node; 51 struct list_head bm_node;
51 struct nfs4_deviceid bm_mdevid; /* associated devid */ 52 struct nfs4_deviceid bm_mdevid; /* associated devid */
52 struct block_device *bm_mdev; /* meta device itself */ 53 struct block_device *bm_mdev; /* meta device itself */
54 struct net *net;
53}; 55};
54 56
55enum exstate4 { 57enum exstate4 {
@@ -151,9 +153,9 @@ BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
151 return BLK_LO2EXT(lseg->pls_layout); 153 return BLK_LO2EXT(lseg->pls_layout);
152} 154}
153 155
154struct bl_dev_msg { 156struct bl_pipe_msg {
155 int32_t status; 157 struct rpc_pipe_msg msg;
156 uint32_t major, minor; 158 wait_queue_head_t *bl_wq;
157}; 159};
158 160
159struct bl_msg_hdr { 161struct bl_msg_hdr {
@@ -161,9 +163,6 @@ struct bl_msg_hdr {
161 u16 totallen; /* length of entire message, including hdr itself */ 163 u16 totallen; /* length of entire message, including hdr itself */
162}; 164};
163 165
164extern struct dentry *bl_device_pipe;
165extern wait_queue_head_t bl_wq;
166
167#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ 166#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */
168#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ 167#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/
169#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ 168#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */
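
The added #include "../netns.h" pulls in the per-net container that replaces the deleted globals. netns.h itself is not part of this excerpt; from the way the blocklayout and client hunks use it, its shape is roughly the following (inferred, not the verbatim definition):

	#include <linux/idr.h>
	#include <linux/wait.h>
	#include <net/netns/generic.h>

	struct nfs_net {
		struct rpc_pipe *bl_device_pipe;   /* blocklayout upcall pipe */
		struct bl_dev_msg bl_mount_reply;  /* last downcall payload */
		wait_queue_head_t bl_wq;           /* waiters for the reply */
		struct idr cb_ident_idr;           /* NFSv4.0 callback idents */
		struct list_head nfs_client_list;
		struct list_head nfs_volume_list;
		spinlock_t nfs_client_lock;
	};

	extern int nfs_net_id;

	/* Per-namespace lookup used throughout the series: */
	static inline struct nfs_net *example_nfs_net(struct net *net)
	{
		return net_generic(net, nfs_net_id);
	}
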
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index d08ba9107fde..a5c88a554d92 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -46,7 +46,7 @@ static int decode_sector_number(__be32 **rp, sector_t *sp)
46 46
47 *rp = xdr_decode_hyper(*rp, &s); 47 *rp = xdr_decode_hyper(*rp, &s);
48 if (s & 0x1ff) { 48 if (s & 0x1ff) {
49 printk(KERN_WARNING "%s: sector not aligned\n", __func__); 49 printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
50 return -1; 50 return -1;
51 } 51 }
52 *sp = s >> SECTOR_SHIFT; 52 *sp = s >> SECTOR_SHIFT;
@@ -79,27 +79,30 @@ int nfs4_blkdev_put(struct block_device *bdev)
79 return blkdev_put(bdev, FMODE_READ); 79 return blkdev_put(bdev, FMODE_READ);
80} 80}
81 81
82static struct bl_dev_msg bl_mount_reply;
83
84ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, 82ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
85 size_t mlen) 83 size_t mlen)
86{ 84{
85 struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
86 nfs_net_id);
87
87 if (mlen != sizeof (struct bl_dev_msg)) 88 if (mlen != sizeof (struct bl_dev_msg))
88 return -EINVAL; 89 return -EINVAL;
89 90
90 if (copy_from_user(&bl_mount_reply, src, mlen) != 0) 91 if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
91 return -EFAULT; 92 return -EFAULT;
92 93
93 wake_up(&bl_wq); 94 wake_up(&nn->bl_wq);
94 95
95 return mlen; 96 return mlen;
96} 97}
97 98
98void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) 99void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
99{ 100{
101 struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg);
102
100 if (msg->errno >= 0) 103 if (msg->errno >= 0)
101 return; 104 return;
102 wake_up(&bl_wq); 105 wake_up(bl_pipe_msg->bl_wq);
103} 106}
104 107
105/* 108/*
@@ -111,29 +114,33 @@ nfs4_blk_decode_device(struct nfs_server *server,
111{ 114{
112 struct pnfs_block_dev *rv; 115 struct pnfs_block_dev *rv;
113 struct block_device *bd = NULL; 116 struct block_device *bd = NULL;
114 struct rpc_pipe_msg msg; 117 struct bl_pipe_msg bl_pipe_msg;
118 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
115 struct bl_msg_hdr bl_msg = { 119 struct bl_msg_hdr bl_msg = {
116 .type = BL_DEVICE_MOUNT, 120 .type = BL_DEVICE_MOUNT,
117 .totallen = dev->mincount, 121 .totallen = dev->mincount,
118 }; 122 };
119 uint8_t *dataptr; 123 uint8_t *dataptr;
120 DECLARE_WAITQUEUE(wq, current); 124 DECLARE_WAITQUEUE(wq, current);
121 struct bl_dev_msg *reply = &bl_mount_reply;
122 int offset, len, i, rc; 125 int offset, len, i, rc;
126 struct net *net = server->nfs_client->net;
127 struct nfs_net *nn = net_generic(net, nfs_net_id);
128 struct bl_dev_msg *reply = &nn->bl_mount_reply;
123 129
124 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); 130 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
125 dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, 131 dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
126 dev->mincount); 132 dev->mincount);
127 133
128 memset(&msg, 0, sizeof(msg)); 134 bl_pipe_msg.bl_wq = &nn->bl_wq;
129 msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); 135 memset(msg, 0, sizeof(*msg));
130 if (!msg.data) { 136 msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
137 if (!msg->data) {
131 rv = ERR_PTR(-ENOMEM); 138 rv = ERR_PTR(-ENOMEM);
132 goto out; 139 goto out;
133 } 140 }
134 141
135 memcpy(msg.data, &bl_msg, sizeof(bl_msg)); 142 memcpy(msg->data, &bl_msg, sizeof(bl_msg));
136 dataptr = (uint8_t *) msg.data; 143 dataptr = (uint8_t *) msg->data;
137 len = dev->mincount; 144 len = dev->mincount;
138 offset = sizeof(bl_msg); 145 offset = sizeof(bl_msg);
139 for (i = 0; len > 0; i++) { 146 for (i = 0; len > 0; i++) {
@@ -142,13 +149,13 @@ nfs4_blk_decode_device(struct nfs_server *server,
142 len -= PAGE_CACHE_SIZE; 149 len -= PAGE_CACHE_SIZE;
143 offset += PAGE_CACHE_SIZE; 150 offset += PAGE_CACHE_SIZE;
144 } 151 }
145 msg.len = sizeof(bl_msg) + dev->mincount; 152 msg->len = sizeof(bl_msg) + dev->mincount;
146 153
147 dprintk("%s CALLING USERSPACE DAEMON\n", __func__); 154 dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
148 add_wait_queue(&bl_wq, &wq); 155 add_wait_queue(&nn->bl_wq, &wq);
149 rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg); 156 rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
150 if (rc < 0) { 157 if (rc < 0) {
151 remove_wait_queue(&bl_wq, &wq); 158 remove_wait_queue(&nn->bl_wq, &wq);
152 rv = ERR_PTR(rc); 159 rv = ERR_PTR(rc);
153 goto out; 160 goto out;
154 } 161 }
@@ -156,7 +163,7 @@ nfs4_blk_decode_device(struct nfs_server *server,
156 set_current_state(TASK_UNINTERRUPTIBLE); 163 set_current_state(TASK_UNINTERRUPTIBLE);
157 schedule(); 164 schedule();
158 __set_current_state(TASK_RUNNING); 165 __set_current_state(TASK_RUNNING);
159 remove_wait_queue(&bl_wq, &wq); 166 remove_wait_queue(&nn->bl_wq, &wq);
160 167
161 if (reply->status != BL_DEVICE_REQUEST_PROC) { 168 if (reply->status != BL_DEVICE_REQUEST_PROC) {
162 dprintk("%s failed to open device: %d\n", 169 dprintk("%s failed to open device: %d\n",
@@ -181,13 +188,14 @@ nfs4_blk_decode_device(struct nfs_server *server,
181 188
182 rv->bm_mdev = bd; 189 rv->bm_mdev = bd;
183 memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); 190 memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
191 rv->net = net;
184 dprintk("%s Created device %s with bd_block_size %u\n", 192 dprintk("%s Created device %s with bd_block_size %u\n",
185 __func__, 193 __func__,
186 bd->bd_disk->disk_name, 194 bd->bd_disk->disk_name,
187 bd->bd_block_size); 195 bd->bd_block_size);
188 196
189out: 197out:
190 kfree(msg.data); 198 kfree(msg->data);
191 return rv; 199 return rv;
192} 200}
193 201
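
bl_pipe_msg exists because ->destroy_msg receives only the struct rpc_pipe_msg pointer. Embedding that struct lets the callback recover the enclosing wrapper, and with it the right per-net waitqueue, via container_of. The pattern in isolation:

	struct bl_pipe_msg {
		struct rpc_pipe_msg msg;    /* embedded, never a pointer */
		wait_queue_head_t *bl_wq;   /* which namespace's waiters to wake */
	};

	static void example_destroy_msg(struct rpc_pipe_msg *msg)
	{
		/* Recover the wrapper from the embedded member... */
		struct bl_pipe_msg *bl_pipe_msg =
			container_of(msg, struct bl_pipe_msg, msg);

		/* ...so an aborted upcall wakes the correct waitqueue. */
		if (msg->errno < 0)
			wake_up(bl_pipe_msg->bl_wq);
	}
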
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
index d055c7558073..737d839bc17b 100644
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ b/fs/nfs/blocklayout/blocklayoutdm.c
@@ -38,9 +38,10 @@
38 38
39#define NFSDBG_FACILITY NFSDBG_PNFS_LD 39#define NFSDBG_FACILITY NFSDBG_PNFS_LD
40 40
41static void dev_remove(dev_t dev) 41static void dev_remove(struct net *net, dev_t dev)
42{ 42{
43 struct rpc_pipe_msg msg; 43 struct bl_pipe_msg bl_pipe_msg;
44 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
44 struct bl_dev_msg bl_umount_request; 45 struct bl_dev_msg bl_umount_request;
45 struct bl_msg_hdr bl_msg = { 46 struct bl_msg_hdr bl_msg = {
46 .type = BL_DEVICE_UMOUNT, 47 .type = BL_DEVICE_UMOUNT,
@@ -48,36 +49,38 @@ static void dev_remove(dev_t dev)
48 }; 49 };
49 uint8_t *dataptr; 50 uint8_t *dataptr;
50 DECLARE_WAITQUEUE(wq, current); 51 DECLARE_WAITQUEUE(wq, current);
52 struct nfs_net *nn = net_generic(net, nfs_net_id);
51 53
52 dprintk("Entering %s\n", __func__); 54 dprintk("Entering %s\n", __func__);
53 55
54 memset(&msg, 0, sizeof(msg)); 56 bl_pipe_msg.bl_wq = &nn->bl_wq;
55 msg.data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); 57 memset(msg, 0, sizeof(*msg));
56 if (!msg.data) 58 msg->data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS);
59 if (!msg->data)
57 goto out; 60 goto out;
58 61
59 memset(&bl_umount_request, 0, sizeof(bl_umount_request)); 62 memset(&bl_umount_request, 0, sizeof(bl_umount_request));
60 bl_umount_request.major = MAJOR(dev); 63 bl_umount_request.major = MAJOR(dev);
61 bl_umount_request.minor = MINOR(dev); 64 bl_umount_request.minor = MINOR(dev);
62 65
63 memcpy(msg.data, &bl_msg, sizeof(bl_msg)); 66 memcpy(msg->data, &bl_msg, sizeof(bl_msg));
64 dataptr = (uint8_t *) msg.data; 67 dataptr = (uint8_t *) msg->data;
65 memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); 68 memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
66 msg.len = sizeof(bl_msg) + bl_msg.totallen; 69 msg->len = sizeof(bl_msg) + bl_msg.totallen;
67 70
68 add_wait_queue(&bl_wq, &wq); 71 add_wait_queue(&nn->bl_wq, &wq);
69 if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { 72 if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
70 remove_wait_queue(&bl_wq, &wq); 73 remove_wait_queue(&nn->bl_wq, &wq);
71 goto out; 74 goto out;
72 } 75 }
73 76
74 set_current_state(TASK_UNINTERRUPTIBLE); 77 set_current_state(TASK_UNINTERRUPTIBLE);
75 schedule(); 78 schedule();
76 __set_current_state(TASK_RUNNING); 79 __set_current_state(TASK_RUNNING);
77 remove_wait_queue(&bl_wq, &wq); 80 remove_wait_queue(&nn->bl_wq, &wq);
78 81
79out: 82out:
80 kfree(msg.data); 83 kfree(msg->data);
81} 84}
82 85
83/* 86/*
@@ -90,10 +93,10 @@ static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
90 dprintk("%s Releasing\n", __func__); 93 dprintk("%s Releasing\n", __func__);
91 rv = nfs4_blkdev_put(bdev->bm_mdev); 94 rv = nfs4_blkdev_put(bdev->bm_mdev);
92 if (rv) 95 if (rv)
93 printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n", 96 printk(KERN_ERR "NFS: %s nfs4_blkdev_put returns %d\n",
94 __func__, rv); 97 __func__, rv);
95 98
96 dev_remove(bdev->bm_mdev->bd_dev); 99 dev_remove(bdev->net, bdev->bm_mdev->bd_dev);
97} 100}
98 101
99void bl_free_block_dev(struct pnfs_block_dev *bdev) 102void bl_free_block_dev(struct pnfs_block_dev *bdev)
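
dev_remove() above and nfs4_blk_decode_device() earlier share one upcall handshake, which after this conversion reads as below. Worth noting: the daemon's reply lands in the shared nn->bl_mount_reply rather than in the request itself, which appears to assume at most one outstanding device upcall per namespace at a time.

	DECLARE_WAITQUEUE(wq, current);

	add_wait_queue(&nn->bl_wq, &wq);            /* register as a waiter first */
	if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
		remove_wait_queue(&nn->bl_wq, &wq); /* upcall never reached the daemon */
		goto out;
	}

	set_current_state(TASK_UNINTERRUPTIBLE);
	schedule();                                 /* bl_pipe_downcall() wakes us */
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&nn->bl_wq, &wq);
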
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 1abac09f7cd5..1f9a6032796b 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -147,7 +147,7 @@ static int _preload_range(struct pnfs_inval_markings *marks,
147 count = (int)(end - start) / (int)tree->mtt_step_size; 147 count = (int)(end - start) / (int)tree->mtt_step_size;
148 148
149 /* Pre-malloc what memory we might need */ 149 /* Pre-malloc what memory we might need */
150 storage = kmalloc(sizeof(*storage) * count, GFP_NOFS); 150 storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
151 if (!storage) 151 if (!storage)
152 return -ENOMEM; 152 return -ENOMEM;
153 for (i = 0; i < count; i++) { 153 for (i = 0; i < count; i++) {
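
The extents.c one-liner is overflow hardening rather than a behaviour change: kcalloc() checks the count-times-size multiplication before allocating (and zeroes the result), while the open-coded multiply can wrap on a huge count and return an undersized buffer:

	/* Unsafe if count is large: the multiply can wrap. */
	storage = kmalloc(sizeof(*storage) * count, GFP_NOFS);

	/* Safe: returns NULL when count * sizeof(*storage) would overflow. */
	storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
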
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index c98b439332fc..dded26368111 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -13,6 +13,7 @@
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/sunrpc/cache.h> 14#include <linux/sunrpc/cache.h>
15#include <linux/sunrpc/rpc_pipe_fs.h> 15#include <linux/sunrpc/rpc_pipe_fs.h>
16#include <net/net_namespace.h>
16 17
17#include "cache_lib.h" 18#include "cache_lib.h"
18 19
@@ -111,30 +112,54 @@ int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
111 return 0; 112 return 0;
112} 113}
113 114
114int nfs_cache_register(struct cache_detail *cd) 115int nfs_cache_register_sb(struct super_block *sb, struct cache_detail *cd)
115{ 116{
116 struct vfsmount *mnt;
117 struct path path;
118 int ret; 117 int ret;
118 struct dentry *dir;
119 119
120 mnt = rpc_get_mount(); 120 dir = rpc_d_lookup_sb(sb, "cache");
121 if (IS_ERR(mnt)) 121 BUG_ON(dir == NULL);
122 return PTR_ERR(mnt); 122 ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd);
123 ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &path); 123 dput(dir);
124 if (ret)
125 goto err;
126 ret = sunrpc_cache_register_pipefs(path.dentry, cd->name, 0600, cd);
127 path_put(&path);
128 if (!ret)
129 return ret;
130err:
131 rpc_put_mount();
132 return ret; 124 return ret;
133} 125}
134 126
135void nfs_cache_unregister(struct cache_detail *cd) 127int nfs_cache_register_net(struct net *net, struct cache_detail *cd)
136{ 128{
137 sunrpc_cache_unregister_pipefs(cd); 129 struct super_block *pipefs_sb;
138 rpc_put_mount(); 130 int ret = 0;
131
132 pipefs_sb = rpc_get_sb_net(net);
133 if (pipefs_sb) {
134 ret = nfs_cache_register_sb(pipefs_sb, cd);
135 rpc_put_sb_net(net);
136 }
137 return ret;
138}
139
140void nfs_cache_unregister_sb(struct super_block *sb, struct cache_detail *cd)
141{
142 if (cd->u.pipefs.dir)
143 sunrpc_cache_unregister_pipefs(cd);
144}
145
146void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd)
147{
148 struct super_block *pipefs_sb;
149
150 pipefs_sb = rpc_get_sb_net(net);
151 if (pipefs_sb) {
152 nfs_cache_unregister_sb(pipefs_sb, cd);
153 rpc_put_sb_net(net);
154 }
155}
156
157void nfs_cache_init(struct cache_detail *cd)
158{
159 sunrpc_init_cache_detail(cd);
139} 160}
140 161
162void nfs_cache_destroy(struct cache_detail *cd)
163{
164 sunrpc_destroy_cache_detail(cd);
165}
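
The old nfs_cache_register()/nfs_cache_unregister() pair is split along two axes: the lifetime of the cache_detail itself (nfs_cache_init()/nfs_cache_destroy()) versus its pipefs visibility, and per-net versus per-superblock entry points. The _net variants may be called whether or not rpc_pipefs is mounted in that namespace; the _sb variants serve the pipefs MOUNT/UMOUNT notifier, which already holds a superblock. An illustrative caller (the function name is hypothetical):

	static int example_cache_setup(struct net *net, struct cache_detail *cd)
	{
		int ret;

		nfs_cache_init(cd);                    /* sunrpc_init_cache_detail() */
		ret = nfs_cache_register_net(net, cd); /* 0 even if pipefs not mounted */
		if (ret)
			nfs_cache_destroy(cd);
		return ret;
	}
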
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
index 7cf6cafcc007..317db95e37f8 100644
--- a/fs/nfs/cache_lib.h
+++ b/fs/nfs/cache_lib.h
@@ -23,5 +23,11 @@ extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
23extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq); 23extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
24extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); 24extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
25 25
26extern int nfs_cache_register(struct cache_detail *cd); 26extern void nfs_cache_init(struct cache_detail *cd);
27extern void nfs_cache_unregister(struct cache_detail *cd); 27extern void nfs_cache_destroy(struct cache_detail *cd);
28extern int nfs_cache_register_net(struct net *net, struct cache_detail *cd);
29extern void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd);
30extern int nfs_cache_register_sb(struct super_block *sb,
31 struct cache_detail *cd);
32extern void nfs_cache_unregister_sb(struct super_block *sb,
33 struct cache_detail *cd);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 516f3375e067..eb95f5091c1a 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -85,7 +85,7 @@ nfs4_callback_svc(void *vrqstp)
85 } 85 }
86 if (err < 0) { 86 if (err < 0) {
87 if (err != preverr) { 87 if (err != preverr) {
88 printk(KERN_WARNING "%s: unexpected error " 88 printk(KERN_WARNING "NFS: %s: unexpected error "
89 "from svc_recv (%d)\n", __func__, err); 89 "from svc_recv (%d)\n", __func__, err);
90 preverr = err; 90 preverr = err;
91 } 91 }
@@ -101,12 +101,12 @@ nfs4_callback_svc(void *vrqstp)
101/* 101/*
102 * Prepare to bring up the NFSv4 callback service 102 * Prepare to bring up the NFSv4 callback service
103 */ 103 */
104struct svc_rqst * 104static struct svc_rqst *
105nfs4_callback_up(struct svc_serv *serv) 105nfs4_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
106{ 106{
107 int ret; 107 int ret;
108 108
109 ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET, 109 ret = svc_create_xprt(serv, "tcp", xprt->xprt_net, PF_INET,
110 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 110 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
111 if (ret <= 0) 111 if (ret <= 0)
112 goto out_err; 112 goto out_err;
@@ -114,7 +114,7 @@ nfs4_callback_up(struct svc_serv *serv)
114 dprintk("NFS: Callback listener port = %u (af %u)\n", 114 dprintk("NFS: Callback listener port = %u (af %u)\n",
115 nfs_callback_tcpport, PF_INET); 115 nfs_callback_tcpport, PF_INET);
116 116
117 ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6, 117 ret = svc_create_xprt(serv, "tcp", xprt->xprt_net, PF_INET6,
118 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 118 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
119 if (ret > 0) { 119 if (ret > 0) {
120 nfs_callback_tcpport6 = ret; 120 nfs_callback_tcpport6 = ret;
@@ -172,7 +172,7 @@ nfs41_callback_svc(void *vrqstp)
172/* 172/*
173 * Bring up the NFSv4.1 callback service 173 * Bring up the NFSv4.1 callback service
174 */ 174 */
175struct svc_rqst * 175static struct svc_rqst *
176nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) 176nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
177{ 177{
178 struct svc_rqst *rqstp; 178 struct svc_rqst *rqstp;
@@ -183,7 +183,7 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
183 * fore channel connection. 183 * fore channel connection.
184 * Returns the input port (0) and sets the svc_serv bc_xprt on success 184 * Returns the input port (0) and sets the svc_serv bc_xprt on success
185 */ 185 */
186 ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0, 186 ret = svc_create_xprt(serv, "tcp-bc", xprt->xprt_net, PF_INET, 0,
187 SVC_SOCK_ANONYMOUS); 187 SVC_SOCK_ANONYMOUS);
188 if (ret < 0) { 188 if (ret < 0) {
189 rqstp = ERR_PTR(ret); 189 rqstp = ERR_PTR(ret);
@@ -269,7 +269,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
269 serv, xprt, &rqstp, &callback_svc); 269 serv, xprt, &rqstp, &callback_svc);
270 if (!minorversion_setup) { 270 if (!minorversion_setup) {
271 /* v4.0 callback setup */ 271 /* v4.0 callback setup */
272 rqstp = nfs4_callback_up(serv); 272 rqstp = nfs4_callback_up(serv, xprt);
273 callback_svc = nfs4_callback_svc; 273 callback_svc = nfs4_callback_svc;
274 } 274 }
275 275
@@ -332,7 +332,6 @@ void nfs_callback_down(int minorversion)
332int 332int
333check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) 333check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
334{ 334{
335 struct rpc_clnt *r = clp->cl_rpcclient;
336 char *p = svc_gss_principal(rqstp); 335 char *p = svc_gss_principal(rqstp);
337 336
338 if (rqstp->rq_authop->flavour != RPC_AUTH_GSS) 337 if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
@@ -353,7 +352,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
353 if (memcmp(p, "nfs@", 4) != 0) 352 if (memcmp(p, "nfs@", 4) != 0)
354 return 0; 353 return 0;
355 p += 4; 354 p += 4;
356 if (strcmp(p, r->cl_server) != 0) 355 if (strcmp(p, clp->cl_hostname) != 0)
357 return 0; 356 return 0;
358 return 1; 357 return 1;
359} 358}
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index c89d3b9e483c..a5527c90a5aa 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -38,7 +38,8 @@ enum nfs4_callback_opnum {
38struct cb_process_state { 38struct cb_process_state {
39 __be32 drc_status; 39 __be32 drc_status;
40 struct nfs_client *clp; 40 struct nfs_client *clp;
41 int slotid; 41 u32 slotid;
42 struct net *net;
42}; 43};
43 44
44struct cb_compound_hdr_arg { 45struct cb_compound_hdr_arg {
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 54cea8ad5a76..1b5d809a105e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -8,6 +8,7 @@
8#include <linux/nfs4.h> 8#include <linux/nfs4.h>
9#include <linux/nfs_fs.h> 9#include <linux/nfs_fs.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/rcupdate.h>
11#include "nfs4_fs.h" 12#include "nfs4_fs.h"
12#include "callback.h" 13#include "callback.h"
13#include "delegation.h" 14#include "delegation.h"
@@ -33,7 +34,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
33 res->bitmap[0] = res->bitmap[1] = 0; 34 res->bitmap[0] = res->bitmap[1] = 0;
34 res->status = htonl(NFS4ERR_BADHANDLE); 35 res->status = htonl(NFS4ERR_BADHANDLE);
35 36
36 dprintk("NFS: GETATTR callback request from %s\n", 37 dprintk_rcu("NFS: GETATTR callback request from %s\n",
37 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 38 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
38 39
39 inode = nfs_delegation_find_inode(cps->clp, &args->fh); 40 inode = nfs_delegation_find_inode(cps->clp, &args->fh);
@@ -73,7 +74,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
73 if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */ 74 if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
74 goto out; 75 goto out;
75 76
76 dprintk("NFS: RECALL callback request from %s\n", 77 dprintk_rcu("NFS: RECALL callback request from %s\n",
77 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 78 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
78 79
79 res = htonl(NFS4ERR_BADHANDLE); 80 res = htonl(NFS4ERR_BADHANDLE);
@@ -86,8 +87,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
86 res = 0; 87 res = 0;
87 break; 88 break;
88 case -ENOENT: 89 case -ENOENT:
89 if (res != 0) 90 res = htonl(NFS4ERR_BAD_STATEID);
90 res = htonl(NFS4ERR_BAD_STATEID);
91 break; 91 break;
92 default: 92 default:
93 res = htonl(NFS4ERR_RESOURCE); 93 res = htonl(NFS4ERR_RESOURCE);
@@ -98,52 +98,64 @@ out:
98 return res; 98 return res;
99} 99}
100 100
101int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
102{
103 if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data,
104 sizeof(delegation->stateid.data)) != 0)
105 return 0;
106 return 1;
107}
108
109#if defined(CONFIG_NFS_V4_1) 101#if defined(CONFIG_NFS_V4_1)
110 102
111static u32 initiate_file_draining(struct nfs_client *clp, 103/*
112 struct cb_layoutrecallargs *args) 104 * Lookup a layout by filehandle.
105 *
106 * Note: gets a refcount on the layout hdr and on its respective inode.
107 * Caller must put the layout hdr and the inode.
108 *
109 * TODO: keep track of all layouts (and delegations) in a hash table
110 * hashed by filehandle.
111 */
112static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, struct nfs_fh *fh)
113{ 113{
114 struct nfs_server *server; 114 struct nfs_server *server;
115 struct pnfs_layout_hdr *lo;
116 struct inode *ino; 115 struct inode *ino;
117 bool found = false; 116 struct pnfs_layout_hdr *lo;
118 u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
119 LIST_HEAD(free_me_list);
120 117
121 spin_lock(&clp->cl_lock);
122 rcu_read_lock();
123 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { 118 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
124 list_for_each_entry(lo, &server->layouts, plh_layouts) { 119 list_for_each_entry(lo, &server->layouts, plh_layouts) {
125 if (nfs_compare_fh(&args->cbl_fh, 120 if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh))
126 &NFS_I(lo->plh_inode)->fh))
127 continue; 121 continue;
128 ino = igrab(lo->plh_inode); 122 ino = igrab(lo->plh_inode);
129 if (!ino) 123 if (!ino)
130 continue; 124 continue;
131 found = true;
132 /* Without this, layout can be freed as soon
133 * as we release cl_lock.
134 */
135 get_layout_hdr(lo); 125 get_layout_hdr(lo);
136 break; 126 return lo;
137 } 127 }
138 if (found)
139 break;
140 } 128 }
129
130 return NULL;
131}
132
133static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, struct nfs_fh *fh)
134{
135 struct pnfs_layout_hdr *lo;
136
137 spin_lock(&clp->cl_lock);
138 rcu_read_lock();
139 lo = get_layout_by_fh_locked(clp, fh);
141 rcu_read_unlock(); 140 rcu_read_unlock();
142 spin_unlock(&clp->cl_lock); 141 spin_unlock(&clp->cl_lock);
143 142
144 if (!found) 143 return lo;
144}
145
146static u32 initiate_file_draining(struct nfs_client *clp,
147 struct cb_layoutrecallargs *args)
148{
149 struct inode *ino;
150 struct pnfs_layout_hdr *lo;
151 u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
152 LIST_HEAD(free_me_list);
153
154 lo = get_layout_by_fh(clp, &args->cbl_fh);
155 if (!lo)
145 return NFS4ERR_NOMATCHING_LAYOUT; 156 return NFS4ERR_NOMATCHING_LAYOUT;
146 157
158 ino = lo->plh_inode;
147 spin_lock(&ino->i_lock); 159 spin_lock(&ino->i_lock);
148 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 160 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
149 mark_matching_lsegs_invalid(lo, &free_me_list, 161 mark_matching_lsegs_invalid(lo, &free_me_list,
@@ -213,17 +225,13 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
213static u32 do_callback_layoutrecall(struct nfs_client *clp, 225static u32 do_callback_layoutrecall(struct nfs_client *clp,
214 struct cb_layoutrecallargs *args) 226 struct cb_layoutrecallargs *args)
215{ 227{
216 u32 res = NFS4ERR_DELAY; 228 u32 res;
217 229
218 dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); 230 dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
219 if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
220 goto out;
221 if (args->cbl_recall_type == RETURN_FILE) 231 if (args->cbl_recall_type == RETURN_FILE)
222 res = initiate_file_draining(clp, args); 232 res = initiate_file_draining(clp, args);
223 else 233 else
224 res = initiate_bulk_draining(clp, args); 234 res = initiate_bulk_draining(clp, args);
225 clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
226out:
227 dprintk("%s returning %i\n", __func__, res); 235 dprintk("%s returning %i\n", __func__, res);
228 return res; 236 return res;
229 237
@@ -303,21 +311,6 @@ out:
303 return res; 311 return res;
304} 312}
305 313
306int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
307{
308 if (delegation == NULL)
309 return 0;
310
311 if (stateid->stateid.seqid != 0)
312 return 0;
313 if (memcmp(&delegation->stateid.stateid.other,
314 &stateid->stateid.other,
315 NFS4_STATEID_OTHER_SIZE))
316 return 0;
317
318 return 1;
319}
320
321/* 314/*
322 * Validate the sequenceID sent by the server. 315 * Validate the sequenceID sent by the server.
323 * Return success if the sequenceID is one more than what we last saw on 316 * Return success if the sequenceID is one more than what we last saw on
@@ -441,7 +434,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
441 int i; 434 int i;
442 __be32 status = htonl(NFS4ERR_BADSESSION); 435 __be32 status = htonl(NFS4ERR_BADSESSION);
443 436
444 clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid); 437 clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, &args->csa_sessionid);
445 if (clp == NULL) 438 if (clp == NULL)
446 goto out; 439 goto out;
447 440
@@ -517,7 +510,7 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
517 if (!cps->clp) /* set in cb_sequence */ 510 if (!cps->clp) /* set in cb_sequence */
518 goto out; 511 goto out;
519 512
520 dprintk("NFS: RECALL_ANY callback request from %s\n", 513 dprintk_rcu("NFS: RECALL_ANY callback request from %s\n",
521 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 514 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
522 515
523 status = cpu_to_be32(NFS4ERR_INVAL); 516 status = cpu_to_be32(NFS4ERR_INVAL);
@@ -552,7 +545,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
552 if (!cps->clp) /* set in cb_sequence */ 545 if (!cps->clp) /* set in cb_sequence */
553 goto out; 546 goto out;
554 547
555 dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", 548 dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
556 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), 549 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
557 args->crsa_target_max_slots); 550 args->crsa_target_max_slots);
558 551
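
The refactored lookup separates list-walking from locking and, per the comment above, returns with two references held. A sketch of the implied put-side contract for callers, where put_layout_hdr() is taken to be the counterpart of get_layout_hdr() (the actual release sites are outside this excerpt):

	lo = get_layout_by_fh(clp, &args->cbl_fh);
	if (!lo)
		return NFS4ERR_NOMATCHING_LAYOUT;

	ino = lo->plh_inode;        /* pinned by igrab() inside the lookup */
	spin_lock(&ino->i_lock);
	/* ... recall processing against lo ... */
	spin_unlock(&ino->i_lock);

	put_layout_hdr(lo);         /* drop the reference taken under cl_lock */
	iput(ino);                  /* drop the igrab() reference */
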
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index d50b2742f23b..95bfc243992c 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -9,6 +9,8 @@
9#include <linux/sunrpc/svc.h> 9#include <linux/sunrpc/svc.h>
10#include <linux/nfs4.h> 10#include <linux/nfs4.h>
11#include <linux/nfs_fs.h> 11#include <linux/nfs_fs.h>
12#include <linux/ratelimit.h>
13#include <linux/printk.h>
12#include <linux/slab.h> 14#include <linux/slab.h>
13#include <linux/sunrpc/bc_xprt.h> 15#include <linux/sunrpc/bc_xprt.h>
14#include "nfs4_fs.h" 16#include "nfs4_fs.h"
@@ -73,7 +75,7 @@ static __be32 *read_buf(struct xdr_stream *xdr, int nbytes)
73 75
74 p = xdr_inline_decode(xdr, nbytes); 76 p = xdr_inline_decode(xdr, nbytes);
75 if (unlikely(p == NULL)) 77 if (unlikely(p == NULL))
76 printk(KERN_WARNING "NFSv4 callback reply buffer overflowed!\n"); 78 printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n");
77 return p; 79 return p;
78} 80}
79 81
@@ -138,10 +140,10 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
138{ 140{
139 __be32 *p; 141 __be32 *p;
140 142
141 p = read_buf(xdr, 16); 143 p = read_buf(xdr, NFS4_STATEID_SIZE);
142 if (unlikely(p == NULL)) 144 if (unlikely(p == NULL))
143 return htonl(NFS4ERR_RESOURCE); 145 return htonl(NFS4ERR_RESOURCE);
144 memcpy(stateid->data, p, 16); 146 memcpy(stateid, p, NFS4_STATEID_SIZE);
145 return 0; 147 return 0;
146} 148}
147 149
@@ -155,7 +157,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
155 return status; 157 return status;
156 /* We do not like overly long tags! */ 158 /* We do not like overly long tags! */
157 if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) { 159 if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) {
158 printk("NFSv4 CALLBACK %s: client sent tag of length %u\n", 160 printk("NFS: NFSv4 CALLBACK %s: client sent tag of length %u\n",
159 __func__, hdr->taglen); 161 __func__, hdr->taglen);
160 return htonl(NFS4ERR_RESOURCE); 162 return htonl(NFS4ERR_RESOURCE);
161 } 163 }
@@ -167,7 +169,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
167 if (hdr->minorversion <= 1) { 169 if (hdr->minorversion <= 1) {
168 hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */ 170 hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */
169 } else { 171 } else {
170 printk(KERN_WARNING "%s: NFSv4 server callback with " 172 pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "
171 "illegal minor version %u!\n", 173 "illegal minor version %u!\n",
172 __func__, hdr->minorversion); 174 __func__, hdr->minorversion);
173 return htonl(NFS4ERR_MINOR_VERS_MISMATCH); 175 return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
@@ -759,14 +761,14 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
759 * Let the state manager know callback processing done. 761 * Let the state manager know callback processing done.
760 * A single slot, so highest used slotid is either 0 or -1 762 * A single slot, so highest used slotid is either 0 or -1
761 */ 763 */
762 tbl->highest_used_slotid = -1; 764 tbl->highest_used_slotid = NFS4_NO_SLOT;
763 nfs4_check_drain_bc_complete(session); 765 nfs4_check_drain_bc_complete(session);
764 spin_unlock(&tbl->slot_tbl_lock); 766 spin_unlock(&tbl->slot_tbl_lock);
765} 767}
766 768
767static void nfs4_cb_free_slot(struct cb_process_state *cps) 769static void nfs4_cb_free_slot(struct cb_process_state *cps)
768{ 770{
769 if (cps->slotid != -1) 771 if (cps->slotid != NFS4_NO_SLOT)
770 nfs4_callback_free_slot(cps->clp->cl_session); 772 nfs4_callback_free_slot(cps->clp->cl_session);
771} 773}
772 774
@@ -860,7 +862,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
860 struct cb_process_state cps = { 862 struct cb_process_state cps = {
861 .drc_status = 0, 863 .drc_status = 0,
862 .clp = NULL, 864 .clp = NULL,
863 .slotid = -1, 865 .slotid = NFS4_NO_SLOT,
866 .net = rqstp->rq_xprt->xpt_net,
864 }; 867 };
865 unsigned int nops = 0; 868 unsigned int nops = 0;
866 869
@@ -876,7 +879,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
876 return rpc_garbage_args; 879 return rpc_garbage_args;
877 880
878 if (hdr_arg.minorversion == 0) { 881 if (hdr_arg.minorversion == 0) {
879 cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident); 882 cps.clp = nfs4_find_client_ident(rqstp->rq_xprt->xpt_net, hdr_arg.cb_ident);
880 if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) 883 if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
881 return rpc_drop_reply; 884 return rpc_drop_reply;
882 } 885 }
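
The -1 sentinels disappear because slotid became u32 in the callback.h hunk above; comparing an unsigned field against a signed literal relies on implicit conversion and trips sign-compare warnings. The constant is assumed to be defined along these lines:

	/* Assumed definition, matching the usage above: */
	#define NFS4_NO_SLOT	((u32)-1)
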
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ad5565acbf3b..da7b5e4ff9ec 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -40,6 +40,8 @@
40#include <net/ipv6.h> 40#include <net/ipv6.h>
41#include <linux/nfs_xdr.h> 41#include <linux/nfs_xdr.h>
42#include <linux/sunrpc/bc_xprt.h> 42#include <linux/sunrpc/bc_xprt.h>
43#include <linux/nsproxy.h>
44#include <linux/pid_namespace.h>
43 45
44 46
45#include "nfs4_fs.h" 47#include "nfs4_fs.h"
@@ -49,15 +51,12 @@
49#include "internal.h" 51#include "internal.h"
50#include "fscache.h" 52#include "fscache.h"
51#include "pnfs.h" 53#include "pnfs.h"
54#include "netns.h"
52 55
53#define NFSDBG_FACILITY NFSDBG_CLIENT 56#define NFSDBG_FACILITY NFSDBG_CLIENT
54 57
55static DEFINE_SPINLOCK(nfs_client_lock);
56static LIST_HEAD(nfs_client_list);
57static LIST_HEAD(nfs_volume_list);
58static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); 58static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
59#ifdef CONFIG_NFS_V4 59#ifdef CONFIG_NFS_V4
60static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */
61 60
62/* 61/*
63 * Get a unique NFSv4.0 callback identifier which will be used 62 * Get a unique NFSv4.0 callback identifier which will be used
@@ -66,15 +65,16 @@ static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */
66static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) 65static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
67{ 66{
68 int ret = 0; 67 int ret = 0;
68 struct nfs_net *nn = net_generic(clp->net, nfs_net_id);
69 69
70 if (clp->rpc_ops->version != 4 || minorversion != 0) 70 if (clp->rpc_ops->version != 4 || minorversion != 0)
71 return ret; 71 return ret;
72retry: 72retry:
73 if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL)) 73 if (!idr_pre_get(&nn->cb_ident_idr, GFP_KERNEL))
74 return -ENOMEM; 74 return -ENOMEM;
75 spin_lock(&nfs_client_lock); 75 spin_lock(&nn->nfs_client_lock);
76 ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident); 76 ret = idr_get_new(&nn->cb_ident_idr, clp, &clp->cl_cb_ident);
77 spin_unlock(&nfs_client_lock); 77 spin_unlock(&nn->nfs_client_lock);
78 if (ret == -EAGAIN) 78 if (ret == -EAGAIN)
79 goto retry; 79 goto retry;
80 return ret; 80 return ret;
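
The loop above is the classic pre-idr-rework allocation idiom, unchanged here except that the idr and its lock move into struct nfs_net: preallocate outside the spinlock, insert under it, and retry on -EAGAIN when a concurrent insertion consumed the preallocated node. Annotated:

	retry:
		if (!idr_pre_get(&nn->cb_ident_idr, GFP_KERNEL))
			return -ENOMEM;   /* could not preallocate at all */
		spin_lock(&nn->nfs_client_lock);
		ret = idr_get_new(&nn->cb_ident_idr, clp, &clp->cl_cb_ident);
		spin_unlock(&nn->nfs_client_lock);
		if (ret == -EAGAIN)       /* raced; preallocation was consumed */
			goto retry;
		return ret;
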
@@ -89,7 +89,7 @@ static bool nfs4_disable_idmapping = true;
89/* 89/*
90 * RPC cruft for NFS 90 * RPC cruft for NFS
91 */ 91 */
92static struct rpc_version *nfs_version[5] = { 92static const struct rpc_version *nfs_version[5] = {
93 [2] = &nfs_version2, 93 [2] = &nfs_version2,
94#ifdef CONFIG_NFS_V3 94#ifdef CONFIG_NFS_V3
95 [3] = &nfs_version3, 95 [3] = &nfs_version3,
@@ -99,7 +99,7 @@ static struct rpc_version *nfs_version[5] = {
99#endif 99#endif
100}; 100};
101 101
102struct rpc_program nfs_program = { 102const struct rpc_program nfs_program = {
103 .name = "nfs", 103 .name = "nfs",
104 .number = NFS_PROGRAM, 104 .number = NFS_PROGRAM,
105 .nrvers = ARRAY_SIZE(nfs_version), 105 .nrvers = ARRAY_SIZE(nfs_version),
@@ -115,11 +115,11 @@ struct rpc_stat nfs_rpcstat = {
115 115
116#ifdef CONFIG_NFS_V3_ACL 116#ifdef CONFIG_NFS_V3_ACL
117static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; 117static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
118static struct rpc_version * nfsacl_version[] = { 118static const struct rpc_version *nfsacl_version[] = {
119 [3] = &nfsacl_version3, 119 [3] = &nfsacl_version3,
120}; 120};
121 121
122struct rpc_program nfsacl_program = { 122const struct rpc_program nfsacl_program = {
123 .name = "nfsacl", 123 .name = "nfsacl",
124 .number = NFS_ACL_PROGRAM, 124 .number = NFS_ACL_PROGRAM,
125 .nrvers = ARRAY_SIZE(nfsacl_version), 125 .nrvers = ARRAY_SIZE(nfsacl_version),
@@ -135,6 +135,7 @@ struct nfs_client_initdata {
135 const struct nfs_rpc_ops *rpc_ops; 135 const struct nfs_rpc_ops *rpc_ops;
136 int proto; 136 int proto;
137 u32 minorversion; 137 u32 minorversion;
138 struct net *net;
138}; 139};
139 140
140/* 141/*
@@ -171,6 +172,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
171 clp->cl_rpcclient = ERR_PTR(-EINVAL); 172 clp->cl_rpcclient = ERR_PTR(-EINVAL);
172 173
173 clp->cl_proto = cl_init->proto; 174 clp->cl_proto = cl_init->proto;
175 clp->net = get_net(cl_init->net);
174 176
175#ifdef CONFIG_NFS_V4 177#ifdef CONFIG_NFS_V4
176 err = nfs_get_cb_ident_idr(clp, cl_init->minorversion); 178 err = nfs_get_cb_ident_idr(clp, cl_init->minorversion);
@@ -202,8 +204,11 @@ error_0:
202#ifdef CONFIG_NFS_V4_1 204#ifdef CONFIG_NFS_V4_1
203static void nfs4_shutdown_session(struct nfs_client *clp) 205static void nfs4_shutdown_session(struct nfs_client *clp)
204{ 206{
205 if (nfs4_has_session(clp)) 207 if (nfs4_has_session(clp)) {
208 nfs4_deviceid_purge_client(clp);
206 nfs4_destroy_session(clp->cl_session); 209 nfs4_destroy_session(clp->cl_session);
210 }
211
211
207} 212}
208#else /* CONFIG_NFS_V4_1 */ 213#else /* CONFIG_NFS_V4_1 */
209static void nfs4_shutdown_session(struct nfs_client *clp) 214static void nfs4_shutdown_session(struct nfs_client *clp)
@@ -233,16 +238,20 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
233} 238}
234 239
235/* idr_remove_all is not needed as all id's are removed by nfs_put_client */ 240/* idr_remove_all is not needed as all id's are removed by nfs_put_client */
236void nfs_cleanup_cb_ident_idr(void) 241void nfs_cleanup_cb_ident_idr(struct net *net)
237{ 242{
238 idr_destroy(&cb_ident_idr); 243 struct nfs_net *nn = net_generic(net, nfs_net_id);
244
245 idr_destroy(&nn->cb_ident_idr);
239} 246}
240 247
241/* nfs_client_lock held */ 248/* nfs_client_lock held */
242static void nfs_cb_idr_remove_locked(struct nfs_client *clp) 249static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
243{ 250{
251 struct nfs_net *nn = net_generic(clp->net, nfs_net_id);
252
244 if (clp->cl_cb_ident) 253 if (clp->cl_cb_ident)
245 idr_remove(&cb_ident_idr, clp->cl_cb_ident); 254 idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident);
246} 255}
247 256
248static void pnfs_init_server(struct nfs_server *server) 257static void pnfs_init_server(struct nfs_server *server)
@@ -260,7 +269,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
260{ 269{
261} 270}
262 271
263void nfs_cleanup_cb_ident_idr(void) 272void nfs_cleanup_cb_ident_idr(struct net *net)
264{ 273{
265} 274}
266 275
@@ -292,10 +301,10 @@ static void nfs_free_client(struct nfs_client *clp)
292 if (clp->cl_machine_cred != NULL) 301 if (clp->cl_machine_cred != NULL)
293 put_rpccred(clp->cl_machine_cred); 302 put_rpccred(clp->cl_machine_cred);
294 303
295 nfs4_deviceid_purge_client(clp); 304 put_net(clp->net);
296
297 kfree(clp->cl_hostname); 305 kfree(clp->cl_hostname);
298 kfree(clp->server_scope); 306 kfree(clp->server_scope);
307 kfree(clp->impl_id);
299 kfree(clp); 308 kfree(clp);
300 309
301 dprintk("<-- nfs_free_client()\n"); 310 dprintk("<-- nfs_free_client()\n");
@@ -306,15 +315,18 @@ static void nfs_free_client(struct nfs_client *clp)
306 */ 315 */
307void nfs_put_client(struct nfs_client *clp) 316void nfs_put_client(struct nfs_client *clp)
308{ 317{
318 struct nfs_net *nn;
319
309 if (!clp) 320 if (!clp)
310 return; 321 return;
311 322
312 dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count)); 323 dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count));
324 nn = net_generic(clp->net, nfs_net_id);
313 325
314 if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { 326 if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) {
315 list_del(&clp->cl_share_link); 327 list_del(&clp->cl_share_link);
316 nfs_cb_idr_remove_locked(clp); 328 nfs_cb_idr_remove_locked(clp);
317 spin_unlock(&nfs_client_lock); 329 spin_unlock(&nn->nfs_client_lock);
318 330
319 BUG_ON(!list_empty(&clp->cl_superblocks)); 331 BUG_ON(!list_empty(&clp->cl_superblocks));
320 332
@@ -392,6 +404,7 @@ static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
392 (sin1->sin_port == sin2->sin_port); 404 (sin1->sin_port == sin2->sin_port);
393} 405}
394 406
407#if defined(CONFIG_NFS_V4_1)
395/* 408/*
396 * Test if two socket addresses represent the same actual socket, 409 * Test if two socket addresses represent the same actual socket,
397 * by comparing (only) relevant fields, excluding the port number. 410 * by comparing (only) relevant fields, excluding the port number.
@@ -410,6 +423,7 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
410 } 423 }
411 return 0; 424 return 0;
412} 425}
426#endif /* CONFIG_NFS_V4_1 */
413 427
414/* 428/*
415 * Test if two socket addresses represent the same actual socket, 429 * Test if two socket addresses represent the same actual socket,
@@ -430,10 +444,10 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
430 return 0; 444 return 0;
431} 445}
432 446
447#if defined(CONFIG_NFS_V4_1)
433/* Common match routine for v4.0 and v4.1 callback services */ 448/* Common match routine for v4.0 and v4.1 callback services */
434bool 449static bool nfs4_cb_match_client(const struct sockaddr *addr,
435nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp, 450 struct nfs_client *clp, u32 minorversion)
436 u32 minorversion)
437{ 451{
438 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; 452 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
439 453
@@ -453,6 +467,7 @@ nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp,
453 467
454 return true; 468 return true;
455} 469}
470#endif /* CONFIG_NFS_V4_1 */
456 471
457/* 472/*
458 * Find an nfs_client on the list that matches the initialisation data 473 * Find an nfs_client on the list that matches the initialisation data
@@ -462,8 +477,9 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
462{ 477{
463 struct nfs_client *clp; 478 struct nfs_client *clp;
464 const struct sockaddr *sap = data->addr; 479 const struct sockaddr *sap = data->addr;
480 struct nfs_net *nn = net_generic(data->net, nfs_net_id);
465 481
466 list_for_each_entry(clp, &nfs_client_list, cl_share_link) { 482 list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
467 const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; 483 const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
468 /* Don't match clients that failed to initialise properly */ 484 /* Don't match clients that failed to initialise properly */
469 if (clp->cl_cons_state < 0) 485 if (clp->cl_cons_state < 0)
@@ -501,13 +517,14 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
501{ 517{
502 struct nfs_client *clp, *new = NULL; 518 struct nfs_client *clp, *new = NULL;
503 int error; 519 int error;
520 struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
504 521
505 dprintk("--> nfs_get_client(%s,v%u)\n", 522 dprintk("--> nfs_get_client(%s,v%u)\n",
506 cl_init->hostname ?: "", cl_init->rpc_ops->version); 523 cl_init->hostname ?: "", cl_init->rpc_ops->version);
507 524
508 /* see if the client already exists */ 525 /* see if the client already exists */
509 do { 526 do {
510 spin_lock(&nfs_client_lock); 527 spin_lock(&nn->nfs_client_lock);
511 528
512 clp = nfs_match_client(cl_init); 529 clp = nfs_match_client(cl_init);
513 if (clp) 530 if (clp)
@@ -515,7 +532,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
515 if (new) 532 if (new)
516 goto install_client; 533 goto install_client;
517 534
518 spin_unlock(&nfs_client_lock); 535 spin_unlock(&nn->nfs_client_lock);
519 536
520 new = nfs_alloc_client(cl_init); 537 new = nfs_alloc_client(cl_init);
521 } while (!IS_ERR(new)); 538 } while (!IS_ERR(new));
@@ -526,8 +543,8 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
526 /* install a new client and return with it unready */ 543 /* install a new client and return with it unready */
527install_client: 544install_client:
528 clp = new; 545 clp = new;
529 list_add(&clp->cl_share_link, &nfs_client_list); 546 list_add(&clp->cl_share_link, &nn->nfs_client_list);
530 spin_unlock(&nfs_client_lock); 547 spin_unlock(&nn->nfs_client_lock);
531 548
532 error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr, 549 error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr,
533 authflavour, noresvport); 550 authflavour, noresvport);
@@ -542,7 +559,7 @@ install_client:
542 * - make sure it's ready before returning 559 * - make sure it's ready before returning
543 */ 560 */
544found_client: 561found_client:
545 spin_unlock(&nfs_client_lock); 562 spin_unlock(&nn->nfs_client_lock);
546 563
547 if (new) 564 if (new)
548 nfs_free_client(new); 565 nfs_free_client(new);
@@ -642,7 +659,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
642{ 659{
643 struct rpc_clnt *clnt = NULL; 660 struct rpc_clnt *clnt = NULL;
644 struct rpc_create_args args = { 661 struct rpc_create_args args = {
645 .net = &init_net, 662 .net = clp->net,
646 .protocol = clp->cl_proto, 663 .protocol = clp->cl_proto,
647 .address = (struct sockaddr *)&clp->cl_addr, 664 .address = (struct sockaddr *)&clp->cl_addr,
648 .addrsize = clp->cl_addrlen, 665 .addrsize = clp->cl_addrlen,
@@ -696,6 +713,7 @@ static int nfs_start_lockd(struct nfs_server *server)
696 .nfs_version = clp->rpc_ops->version, 713 .nfs_version = clp->rpc_ops->version,
697 .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? 714 .noresvport = server->flags & NFS_MOUNT_NORESVPORT ?
698 1 : 0, 715 1 : 0,
716 .net = clp->net,
699 }; 717 };
700 718
701 if (nlm_init.nfs_version > 3) 719 if (nlm_init.nfs_version > 3)
@@ -831,6 +849,7 @@ static int nfs_init_server(struct nfs_server *server,
831 .addrlen = data->nfs_server.addrlen, 849 .addrlen = data->nfs_server.addrlen,
832 .rpc_ops = &nfs_v2_clientops, 850 .rpc_ops = &nfs_v2_clientops,
833 .proto = data->nfs_server.protocol, 851 .proto = data->nfs_server.protocol,
852 .net = data->net,
834 }; 853 };
835 struct rpc_timeout timeparms; 854 struct rpc_timeout timeparms;
836 struct nfs_client *clp; 855 struct nfs_client *clp;
@@ -1029,25 +1048,30 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
1029static void nfs_server_insert_lists(struct nfs_server *server) 1048static void nfs_server_insert_lists(struct nfs_server *server)
1030{ 1049{
1031 struct nfs_client *clp = server->nfs_client; 1050 struct nfs_client *clp = server->nfs_client;
1051 struct nfs_net *nn = net_generic(clp->net, nfs_net_id);
1032 1052
1033 spin_lock(&nfs_client_lock); 1053 spin_lock(&nn->nfs_client_lock);
1034 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); 1054 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
1035 list_add_tail(&server->master_link, &nfs_volume_list); 1055 list_add_tail(&server->master_link, &nn->nfs_volume_list);
1036 clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); 1056 clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
1037 spin_unlock(&nfs_client_lock); 1057 spin_unlock(&nn->nfs_client_lock);
1038 1058
1039} 1059}
1040 1060
1041static void nfs_server_remove_lists(struct nfs_server *server) 1061static void nfs_server_remove_lists(struct nfs_server *server)
1042{ 1062{
1043 struct nfs_client *clp = server->nfs_client; 1063 struct nfs_client *clp = server->nfs_client;
1064 struct nfs_net *nn;
1044 1065
1045 spin_lock(&nfs_client_lock); 1066 if (clp == NULL)
1067 return;
1068 nn = net_generic(clp->net, nfs_net_id);
1069 spin_lock(&nn->nfs_client_lock);
1046 list_del_rcu(&server->client_link); 1070 list_del_rcu(&server->client_link);
1047 if (clp && list_empty(&clp->cl_superblocks)) 1071 if (list_empty(&clp->cl_superblocks))
1048 set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); 1072 set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
1049 list_del(&server->master_link); 1073 list_del(&server->master_link);
1050 spin_unlock(&nfs_client_lock); 1074 spin_unlock(&nn->nfs_client_lock);
1051 1075
1052 synchronize_rcu(); 1076 synchronize_rcu();
1053} 1077}
@@ -1086,6 +1110,8 @@ static struct nfs_server *nfs_alloc_server(void)
1086 return NULL; 1110 return NULL;
1087 } 1111 }
1088 1112
1113 ida_init(&server->openowner_id);
1114 ida_init(&server->lockowner_id);
1089 pnfs_init_server(server); 1115 pnfs_init_server(server);
1090 1116
1091 return server; 1117 return server;
@@ -1111,6 +1137,8 @@ void nfs_free_server(struct nfs_server *server)
1111 1137
1112 nfs_put_client(server->nfs_client); 1138 nfs_put_client(server->nfs_client);
1113 1139
1140 ida_destroy(&server->lockowner_id);
1141 ida_destroy(&server->openowner_id);
1114 nfs_free_iostats(server->io_stats); 1142 nfs_free_iostats(server->io_stats);
1115 bdi_destroy(&server->backing_dev_info); 1143 bdi_destroy(&server->backing_dev_info);
1116 kfree(server); 1144 kfree(server);
@@ -1189,45 +1217,19 @@ error:
1189/* 1217/*
1190 * NFSv4.0 callback thread helper 1218 * NFSv4.0 callback thread helper
1191 * 1219 *
1192 * Find a client by IP address, protocol version, and minorversion
1193 *
1194 * Called from the pg_authenticate method. The callback identifier
1195 * is not used as it has not been decoded.
1196 *
1197 * Returns NULL if no such client
1198 */
1199struct nfs_client *
1200nfs4_find_client_no_ident(const struct sockaddr *addr)
1201{
1202 struct nfs_client *clp;
1203
1204 spin_lock(&nfs_client_lock);
1205 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
1206 if (nfs4_cb_match_client(addr, clp, 0) == false)
1207 continue;
1208 atomic_inc(&clp->cl_count);
1209 spin_unlock(&nfs_client_lock);
1210 return clp;
1211 }
1212 spin_unlock(&nfs_client_lock);
1213 return NULL;
1214}
1215
1216/*
1217 * NFSv4.0 callback thread helper
1218 *
1219 * Find a client by callback identifier 1220 * Find a client by callback identifier
1220 */ 1221 */
1221struct nfs_client * 1222struct nfs_client *
1222nfs4_find_client_ident(int cb_ident) 1223nfs4_find_client_ident(struct net *net, int cb_ident)
1223{ 1224{
1224 struct nfs_client *clp; 1225 struct nfs_client *clp;
1226 struct nfs_net *nn = net_generic(net, nfs_net_id);
1225 1227
1226 spin_lock(&nfs_client_lock); 1228 spin_lock(&nn->nfs_client_lock);
1227 clp = idr_find(&cb_ident_idr, cb_ident); 1229 clp = idr_find(&nn->cb_ident_idr, cb_ident);
1228 if (clp) 1230 if (clp)
1229 atomic_inc(&clp->cl_count); 1231 atomic_inc(&clp->cl_count);
1230 spin_unlock(&nfs_client_lock); 1232 spin_unlock(&nn->nfs_client_lock);
1231 return clp; 1233 return clp;
1232} 1234}
1233 1235
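
nfs4_find_client_ident() now searches a specific namespace, and the atomic_inc() stays inside nn->nfs_client_lock so the client cannot disappear between the idr_find() and the reference bump. A hedged sketch of the caller side (the v4.0 callback service; the error value here is an assumption, not from this diff):

struct nfs_client *clp;

clp = nfs4_find_client_ident(net, cb_ident);
if (clp == NULL)
	return htonl(NFS4ERR_BADHANDLE);	/* assumed callback error path */
/* ... process the CB_COMPOUND against clp ... */
nfs_put_client(clp);	/* drop the reference taken under nfs_client_lock */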
@@ -1240,13 +1242,14 @@ nfs4_find_client_ident(int cb_ident)
1240 * Returns NULL if no such client 1242 * Returns NULL if no such client
1241 */ 1243 */
1242struct nfs_client * 1244struct nfs_client *
1243nfs4_find_client_sessionid(const struct sockaddr *addr, 1245nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
1244 struct nfs4_sessionid *sid) 1246 struct nfs4_sessionid *sid)
1245{ 1247{
1246 struct nfs_client *clp; 1248 struct nfs_client *clp;
1249 struct nfs_net *nn = net_generic(net, nfs_net_id);
1247 1250
1248 spin_lock(&nfs_client_lock); 1251 spin_lock(&nn->nfs_client_lock);
1249 list_for_each_entry(clp, &nfs_client_list, cl_share_link) { 1252 list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
1250 if (nfs4_cb_match_client(addr, clp, 1) == false) 1253 if (nfs4_cb_match_client(addr, clp, 1) == false)
1251 continue; 1254 continue;
1252 1255
@@ -1259,17 +1262,17 @@ nfs4_find_client_sessionid(const struct sockaddr *addr,
1259 continue; 1262 continue;
1260 1263
1261 atomic_inc(&clp->cl_count); 1264 atomic_inc(&clp->cl_count);
1262 spin_unlock(&nfs_client_lock); 1265 spin_unlock(&nn->nfs_client_lock);
1263 return clp; 1266 return clp;
1264 } 1267 }
1265 spin_unlock(&nfs_client_lock); 1268 spin_unlock(&nn->nfs_client_lock);
1266 return NULL; 1269 return NULL;
1267} 1270}
1268 1271
1269#else /* CONFIG_NFS_V4_1 */ 1272#else /* CONFIG_NFS_V4_1 */
1270 1273
1271struct nfs_client * 1274struct nfs_client *
1272nfs4_find_client_sessionid(const struct sockaddr *addr, 1275nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
1273 struct nfs4_sessionid *sid) 1276 struct nfs4_sessionid *sid)
1274{ 1277{
1275 return NULL; 1278 return NULL;
@@ -1284,16 +1287,18 @@ static int nfs4_init_callback(struct nfs_client *clp)
1284 int error; 1287 int error;
1285 1288
1286 if (clp->rpc_ops->version == 4) { 1289 if (clp->rpc_ops->version == 4) {
1290 struct rpc_xprt *xprt;
1291
1292 xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt);
1293
1287 if (nfs4_has_session(clp)) { 1294 if (nfs4_has_session(clp)) {
1288 error = xprt_setup_backchannel( 1295 error = xprt_setup_backchannel(xprt,
1289 clp->cl_rpcclient->cl_xprt,
1290 NFS41_BC_MIN_CALLBACKS); 1296 NFS41_BC_MIN_CALLBACKS);
1291 if (error < 0) 1297 if (error < 0)
1292 return error; 1298 return error;
1293 } 1299 }
1294 1300
1295 error = nfs_callback_up(clp->cl_mvops->minor_version, 1301 error = nfs_callback_up(clp->cl_mvops->minor_version, xprt);
1296 clp->cl_rpcclient->cl_xprt);
1297 if (error < 0) { 1302 if (error < 0) {
1298 dprintk("%s: failed to start callback. Error = %d\n", 1303 dprintk("%s: failed to start callback. Error = %d\n",
1299 __func__, error); 1304 __func__, error);
@@ -1344,6 +1349,7 @@ int nfs4_init_client(struct nfs_client *clp,
1344 rpc_authflavor_t authflavour, 1349 rpc_authflavor_t authflavour,
1345 int noresvport) 1350 int noresvport)
1346{ 1351{
1352 char buf[INET6_ADDRSTRLEN + 1];
1347 int error; 1353 int error;
1348 1354
1349 if (clp->cl_cons_state == NFS_CS_READY) { 1355 if (clp->cl_cons_state == NFS_CS_READY) {
@@ -1359,6 +1365,20 @@ int nfs4_init_client(struct nfs_client *clp,
1359 1, noresvport); 1365 1, noresvport);
1360 if (error < 0) 1366 if (error < 0)
1361 goto error; 1367 goto error;
1368
1369 /* If no clientaddr= option was specified, find a usable cb address */
1370 if (ip_addr == NULL) {
1371 struct sockaddr_storage cb_addr;
1372 struct sockaddr *sap = (struct sockaddr *)&cb_addr;
1373
1374 error = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr));
1375 if (error < 0)
1376 goto error;
1377 error = rpc_ntop(sap, buf, sizeof(buf));
1378 if (error < 0)
1379 goto error;
1380 ip_addr = (const char *)buf;
1381 }
1362 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); 1382 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
1363 1383
1364 error = nfs_idmap_new(clp); 1384 error = nfs_idmap_new(clp);
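
When the mount supplied no clientaddr= option, nfs4_init_client() now derives a callback address itself: rpc_localaddr() reports the local address the transport uses to reach this server, rpc_ntop() renders it as a string, and strlcpy() captures it into clp->cl_ipaddr before the on-stack buf goes out of scope. The same two calls in isolation, as a sketch:

#include <linux/sunrpc/clnt.h>

static int example_local_addr(struct rpc_clnt *clnt, char *buf, size_t buflen)
{
	struct sockaddr_storage address;
	struct sockaddr *sap = (struct sockaddr *)&address;
	int err;

	err = rpc_localaddr(clnt, sap, sizeof(address));	/* source address toward the server */
	if (err < 0)
		return err;
	return rpc_ntop(sap, buf, buflen);	/* presentation form, e.g. "192.0.2.1" */
}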
@@ -1393,7 +1413,7 @@ static int nfs4_set_client(struct nfs_server *server,
1393 const char *ip_addr, 1413 const char *ip_addr,
1394 rpc_authflavor_t authflavour, 1414 rpc_authflavor_t authflavour,
1395 int proto, const struct rpc_timeout *timeparms, 1415 int proto, const struct rpc_timeout *timeparms,
1396 u32 minorversion) 1416 u32 minorversion, struct net *net)
1397{ 1417{
1398 struct nfs_client_initdata cl_init = { 1418 struct nfs_client_initdata cl_init = {
1399 .hostname = hostname, 1419 .hostname = hostname,
@@ -1402,6 +1422,7 @@ static int nfs4_set_client(struct nfs_server *server,
1402 .rpc_ops = &nfs_v4_clientops, 1422 .rpc_ops = &nfs_v4_clientops,
1403 .proto = proto, 1423 .proto = proto,
1404 .minorversion = minorversion, 1424 .minorversion = minorversion,
1425 .net = net,
1405 }; 1426 };
1406 struct nfs_client *clp; 1427 struct nfs_client *clp;
1407 int error; 1428 int error;
@@ -1453,6 +1474,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
1453 .rpc_ops = &nfs_v4_clientops, 1474 .rpc_ops = &nfs_v4_clientops,
1454 .proto = ds_proto, 1475 .proto = ds_proto,
1455 .minorversion = mds_clp->cl_minorversion, 1476 .minorversion = mds_clp->cl_minorversion,
1477 .net = mds_clp->net,
1456 }; 1478 };
1457 struct rpc_timeout ds_timeout = { 1479 struct rpc_timeout ds_timeout = {
1458 .to_initval = 15 * HZ, 1480 .to_initval = 15 * HZ,
@@ -1580,7 +1602,8 @@ static int nfs4_init_server(struct nfs_server *server,
1580 data->auth_flavors[0], 1602 data->auth_flavors[0],
1581 data->nfs_server.protocol, 1603 data->nfs_server.protocol,
1582 &timeparms, 1604 &timeparms,
1583 data->minorversion); 1605 data->minorversion,
1606 data->net);
1584 if (error < 0) 1607 if (error < 0)
1585 goto error; 1608 goto error;
1586 1609
@@ -1675,9 +1698,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1675 data->addrlen, 1698 data->addrlen,
1676 parent_client->cl_ipaddr, 1699 parent_client->cl_ipaddr,
1677 data->authflavor, 1700 data->authflavor,
1678 parent_server->client->cl_xprt->prot, 1701 rpc_protocol(parent_server->client),
1679 parent_server->client->cl_timeout, 1702 parent_server->client->cl_timeout,
1680 parent_client->cl_mvops->minor_version); 1703 parent_client->cl_mvops->minor_version,
1704 parent_client->net);
1681 if (error < 0) 1705 if (error < 0)
1682 goto error; 1706 goto error;
1683 1707
@@ -1770,6 +1794,18 @@ out_free_server:
1770 return ERR_PTR(error); 1794 return ERR_PTR(error);
1771} 1795}
1772 1796
1797void nfs_clients_init(struct net *net)
1798{
1799 struct nfs_net *nn = net_generic(net, nfs_net_id);
1800
1801 INIT_LIST_HEAD(&nn->nfs_client_list);
1802 INIT_LIST_HEAD(&nn->nfs_volume_list);
1803#ifdef CONFIG_NFS_V4
1804 idr_init(&nn->cb_ident_idr);
1805#endif
1806 spin_lock_init(&nn->nfs_client_lock);
1807}
1808
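
nfs_clients_init() gives each namespace fresh lists, a callback IDR and a lock. It is presumably invoked from a pernet init hook registered elsewhere in this series; a sketch of the conventional wiring (the hook names are assumed):

#include <net/net_namespace.h>
#include <net/netns/generic.h>

int nfs_net_id;	/* filled in by register_pernet_subsys() */

static __net_init int nfs_net_init(struct net *net)
{
	nfs_clients_init(net);
	return 0;
}

static struct pernet_operations nfs_net_ops = {
	.init = nfs_net_init,
	.id   = &nfs_net_id,
	.size = sizeof(struct nfs_net),	/* net_generic(net, nfs_net_id) returns this much zeroed space */
};

/* module init would then do: register_pernet_subsys(&nfs_net_ops); */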
1773#ifdef CONFIG_PROC_FS 1809#ifdef CONFIG_PROC_FS
1774static struct proc_dir_entry *proc_fs_nfs; 1810static struct proc_dir_entry *proc_fs_nfs;
1775 1811
@@ -1823,13 +1859,15 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
1823{ 1859{
1824 struct seq_file *m; 1860 struct seq_file *m;
1825 int ret; 1861 int ret;
1862 struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info;
1863 struct net *net = pid_ns->child_reaper->nsproxy->net_ns;
1826 1864
1827 ret = seq_open(file, &nfs_server_list_ops); 1865 ret = seq_open(file, &nfs_server_list_ops);
1828 if (ret < 0) 1866 if (ret < 0)
1829 return ret; 1867 return ret;
1830 1868
1831 m = file->private_data; 1869 m = file->private_data;
1832 m->private = PDE(inode)->data; 1870 m->private = net;
1833 1871
1834 return 0; 1872 return 0;
1835} 1873}
@@ -1839,9 +1877,11 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
1839 */ 1877 */
1840static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) 1878static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
1841{ 1879{
1880 struct nfs_net *nn = net_generic(m->private, nfs_net_id);
1881
1842 /* lock the list against modification */ 1882 /* lock the list against modification */
1843 spin_lock(&nfs_client_lock); 1883 spin_lock(&nn->nfs_client_lock);
1844 return seq_list_start_head(&nfs_client_list, *_pos); 1884 return seq_list_start_head(&nn->nfs_client_list, *_pos);
1845} 1885}
1846 1886
1847/* 1887/*
@@ -1849,7 +1889,9 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
1849 */ 1889 */
1850static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) 1890static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
1851{ 1891{
1852 return seq_list_next(v, &nfs_client_list, pos); 1892 struct nfs_net *nn = net_generic(p->private, nfs_net_id);
1893
1894 return seq_list_next(v, &nn->nfs_client_list, pos);
1853} 1895}
1854 1896
1855/* 1897/*
@@ -1857,7 +1899,9 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
1857 */ 1899 */
1858static void nfs_server_list_stop(struct seq_file *p, void *v) 1900static void nfs_server_list_stop(struct seq_file *p, void *v)
1859{ 1901{
1860 spin_unlock(&nfs_client_lock); 1902 struct nfs_net *nn = net_generic(p->private, nfs_net_id);
1903
1904 spin_unlock(&nn->nfs_client_lock);
1861} 1905}
1862 1906
1863/* 1907/*
@@ -1866,9 +1910,10 @@ static void nfs_server_list_stop(struct seq_file *p, void *v)
1866static int nfs_server_list_show(struct seq_file *m, void *v) 1910static int nfs_server_list_show(struct seq_file *m, void *v)
1867{ 1911{
1868 struct nfs_client *clp; 1912 struct nfs_client *clp;
1913 struct nfs_net *nn = net_generic(m->private, nfs_net_id);
1869 1914
1870 /* display header on line 1 */ 1915 /* display header on line 1 */
1871 if (v == &nfs_client_list) { 1916 if (v == &nn->nfs_client_list) {
1872 seq_puts(m, "NV SERVER PORT USE HOSTNAME\n"); 1917 seq_puts(m, "NV SERVER PORT USE HOSTNAME\n");
1873 return 0; 1918 return 0;
1874 } 1919 }
@@ -1880,12 +1925,14 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
1880 if (clp->cl_cons_state != NFS_CS_READY) 1925 if (clp->cl_cons_state != NFS_CS_READY)
1881 return 0; 1926 return 0;
1882 1927
1928 rcu_read_lock();
1883 seq_printf(m, "v%u %s %s %3d %s\n", 1929 seq_printf(m, "v%u %s %s %3d %s\n",
1884 clp->rpc_ops->version, 1930 clp->rpc_ops->version,
1885 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), 1931 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
1886 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), 1932 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
1887 atomic_read(&clp->cl_count), 1933 atomic_read(&clp->cl_count),
1888 clp->cl_hostname); 1934 clp->cl_hostname);
1935 rcu_read_unlock();
1889 1936
1890 return 0; 1937 return 0;
1891} 1938}
@@ -1897,13 +1944,15 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
1897{ 1944{
1898 struct seq_file *m; 1945 struct seq_file *m;
1899 int ret; 1946 int ret;
1947 struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info;
1948 struct net *net = pid_ns->child_reaper->nsproxy->net_ns;
1900 1949
1901 ret = seq_open(file, &nfs_volume_list_ops); 1950 ret = seq_open(file, &nfs_volume_list_ops);
1902 if (ret < 0) 1951 if (ret < 0)
1903 return ret; 1952 return ret;
1904 1953
1905 m = file->private_data; 1954 m = file->private_data;
1906 m->private = PDE(inode)->data; 1955 m->private = net;
1907 1956
1908 return 0; 1957 return 0;
1909} 1958}
@@ -1913,9 +1962,11 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
1913 */ 1962 */
1914static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) 1963static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
1915{ 1964{
1965 struct nfs_net *nn = net_generic(m->private, nfs_net_id);
1966
1916 /* lock the list against modification */ 1967 /* lock the list against modification */
1917 spin_lock(&nfs_client_lock); 1968 spin_lock(&nn->nfs_client_lock);
1918 return seq_list_start_head(&nfs_volume_list, *_pos); 1969 return seq_list_start_head(&nn->nfs_volume_list, *_pos);
1919} 1970}
1920 1971
1921/* 1972/*
@@ -1923,7 +1974,9 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
1923 */ 1974 */
1924static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) 1975static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
1925{ 1976{
1926 return seq_list_next(v, &nfs_volume_list, pos); 1977 struct nfs_net *nn = net_generic(p->private, nfs_net_id);
1978
1979 return seq_list_next(v, &nn->nfs_volume_list, pos);
1927} 1980}
1928 1981
1929/* 1982/*
@@ -1931,7 +1984,9 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
1931 */ 1984 */
1932static void nfs_volume_list_stop(struct seq_file *p, void *v) 1985static void nfs_volume_list_stop(struct seq_file *p, void *v)
1933{ 1986{
1934 spin_unlock(&nfs_client_lock); 1987 struct nfs_net *nn = net_generic(p->private, nfs_net_id);
1988
1989 spin_unlock(&nn->nfs_client_lock);
1935} 1990}
1936 1991
1937/* 1992/*
@@ -1942,9 +1997,10 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1942 struct nfs_server *server; 1997 struct nfs_server *server;
1943 struct nfs_client *clp; 1998 struct nfs_client *clp;
1944 char dev[8], fsid[17]; 1999 char dev[8], fsid[17];
2000 struct nfs_net *nn = net_generic(m->private, nfs_net_id);
1945 2001
1946 /* display header on line 1 */ 2002 /* display header on line 1 */
1947 if (v == &nfs_volume_list) { 2003 if (v == &nn->nfs_volume_list) {
1948 seq_puts(m, "NV SERVER PORT DEV FSID FSC\n"); 2004 seq_puts(m, "NV SERVER PORT DEV FSID FSC\n");
1949 return 0; 2005 return 0;
1950 } 2006 }
@@ -1959,6 +2015,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1959 (unsigned long long) server->fsid.major, 2015 (unsigned long long) server->fsid.major,
1960 (unsigned long long) server->fsid.minor); 2016 (unsigned long long) server->fsid.minor);
1961 2017
2018 rcu_read_lock();
1962 seq_printf(m, "v%u %s %s %-7s %-17s %s\n", 2019 seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
1963 clp->rpc_ops->version, 2020 clp->rpc_ops->version,
1964 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), 2021 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
@@ -1966,6 +2023,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1966 dev, 2023 dev,
1967 fsid, 2024 fsid,
1968 nfs_server_fscache_state(server)); 2025 nfs_server_fscache_state(server));
2026 rcu_read_unlock();
1969 2027
1970 return 0; 2028 return 0;
1971} 2029}
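
The rcu_read_lock()/rcu_read_unlock() pairs added around rpc_peeraddr2str(), like the rcu_dereference_raw() of cl_rpcclient->cl_xprt in nfs4_init_callback() above, reflect that the RPC client's transport pointer is now RCU-managed so it can be replaced at runtime. The safe read-side pattern, sketched:

#include <linux/rcupdate.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/xprt.h>

static void example_print_peer(struct rpc_clnt *clnt)
{
	struct rpc_xprt *xprt;

	rcu_read_lock();
	xprt = rcu_dereference(clnt->cl_xprt);	/* stays valid until the unlock */
	printk(KERN_DEBUG "peer: %s\n",
	       xprt->address_strings[RPC_DISPLAY_ADDR]);
	rcu_read_unlock();
}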
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 7f2654069806..89af1d269274 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -105,7 +105,7 @@ again:
105 continue; 105 continue;
106 if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) 106 if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
107 continue; 107 continue;
108 if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) 108 if (!nfs4_stateid_match(&state->stateid, stateid))
109 continue; 109 continue;
110 get_nfs_open_context(ctx); 110 get_nfs_open_context(ctx);
111 spin_unlock(&inode->i_lock); 111 spin_unlock(&inode->i_lock);
@@ -139,8 +139,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
139 if (delegation != NULL) { 139 if (delegation != NULL) {
140 spin_lock(&delegation->lock); 140 spin_lock(&delegation->lock);
141 if (delegation->inode != NULL) { 141 if (delegation->inode != NULL) {
142 memcpy(delegation->stateid.data, res->delegation.data, 142 nfs4_stateid_copy(&delegation->stateid, &res->delegation);
143 sizeof(delegation->stateid.data));
144 delegation->type = res->delegation_type; 143 delegation->type = res->delegation_type;
145 delegation->maxsize = res->maxsize; 144 delegation->maxsize = res->maxsize;
146 oldcred = delegation->cred; 145 oldcred = delegation->cred;
@@ -236,8 +235,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
236 delegation = kmalloc(sizeof(*delegation), GFP_NOFS); 235 delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
237 if (delegation == NULL) 236 if (delegation == NULL)
238 return -ENOMEM; 237 return -ENOMEM;
239 memcpy(delegation->stateid.data, res->delegation.data, 238 nfs4_stateid_copy(&delegation->stateid, &res->delegation);
240 sizeof(delegation->stateid.data));
241 delegation->type = res->delegation_type; 239 delegation->type = res->delegation_type;
242 delegation->maxsize = res->maxsize; 240 delegation->maxsize = res->maxsize;
243 delegation->change_attr = inode->i_version; 241 delegation->change_attr = inode->i_version;
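
The open-coded memcmp()/memcpy() over stateid.data give way to nfs4_stateid_match()/nfs4_stateid_copy(). Their definitions are not part of this diff (the diffstat touches fs/nfs/nfs4_fs.h); they presumably reduce to:

/* sketch; the real inlines live in fs/nfs/nfs4_fs.h */
static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
{
	memcpy(dst, src, sizeof(*dst));
}

static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src)
{
	return memcmp(dst, src, sizeof(*dst)) == 0;
}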
@@ -250,19 +248,22 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
250 old_delegation = rcu_dereference_protected(nfsi->delegation, 248 old_delegation = rcu_dereference_protected(nfsi->delegation,
251 lockdep_is_held(&clp->cl_lock)); 249 lockdep_is_held(&clp->cl_lock));
252 if (old_delegation != NULL) { 250 if (old_delegation != NULL) {
253 if (memcmp(&delegation->stateid, &old_delegation->stateid, 251 if (nfs4_stateid_match(&delegation->stateid,
254 sizeof(old_delegation->stateid)) == 0 && 252 &old_delegation->stateid) &&
255 delegation->type == old_delegation->type) { 253 delegation->type == old_delegation->type) {
256 goto out; 254 goto out;
257 } 255 }
258 /* 256 /*
259 * Deal with broken servers that hand out two 257 * Deal with broken servers that hand out two
260 * delegations for the same file. 258 * delegations for the same file.
259 * Allow for upgrades to a WRITE delegation, but
260 * nothing else.
261 */ 261 */
262 dfprintk(FILE, "%s: server %s handed out " 262 dfprintk(FILE, "%s: server %s handed out "
263 "a duplicate delegation!\n", 263 "a duplicate delegation!\n",
264 __func__, clp->cl_hostname); 264 __func__, clp->cl_hostname);
265 if (delegation->type <= old_delegation->type) { 265 if (delegation->type == old_delegation->type ||
266 !(delegation->type & FMODE_WRITE)) {
266 freeme = delegation; 267 freeme = delegation;
267 delegation = NULL; 268 delegation = NULL;
268 goto out; 269 goto out;
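
The duplicate-delegation check changes from an ordering comparison on fmode_t values to an explicit rule: a replacement delegation is kept only when it differs from the old one and carries FMODE_WRITE, so only a READ-to-WRITE upgrade survives and everything else is freed as a duplicate. The predicate on its own:

#include <linux/types.h>
#include <linux/fs.h>

/* keep the incoming delegation only if it upgrades the cached one to WRITE */
static bool delegation_is_upgrade(fmode_t new_type, fmode_t old_type)
{
	return new_type != old_type && (new_type & FMODE_WRITE) != 0;
}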
@@ -455,17 +456,24 @@ static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp,
455 rcu_read_unlock(); 456 rcu_read_unlock();
456} 457}
457 458
458static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
459{
460 nfs_client_mark_return_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
461}
462
463static void nfs_delegation_run_state_manager(struct nfs_client *clp) 459static void nfs_delegation_run_state_manager(struct nfs_client *clp)
464{ 460{
465 if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) 461 if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
466 nfs4_schedule_state_manager(clp); 462 nfs4_schedule_state_manager(clp);
467} 463}
468 464
465void nfs_remove_bad_delegation(struct inode *inode)
466{
467 struct nfs_delegation *delegation;
468
469 delegation = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode));
470 if (delegation) {
471 nfs_inode_find_state_and_recover(inode, &delegation->stateid);
472 nfs_free_delegation(delegation);
473 }
474}
475EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation);
476
469/** 477/**
470 * nfs_expire_all_delegation_types 478 * nfs_expire_all_delegation_types
471 * @clp: client to process 479 * @clp: client to process
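
nfs_remove_bad_delegation() is a recovery entry point for delegations the server no longer recognizes: detach the delegation, kick state recovery for any opens still naming its stateid, then free it. A sketch of the assumed call site (stateid error handling in nfs4proc.c; the exact errors and return value are assumptions):

/* sketch: assumed caller when the server rejects a delegated stateid */
static int example_stateid_error(struct inode *inode, int err)
{
	switch (err) {
	case -NFS4ERR_BAD_STATEID:
	case -NFS4ERR_EXPIRED:
		nfs_remove_bad_delegation(inode);	/* detach, recover, free */
		return -EAGAIN;				/* retry without the delegation */
	}
	return err;
}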
@@ -488,18 +496,6 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
488 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); 496 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
489} 497}
490 498
491/**
492 * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
493 * @clp: client to process
494 *
495 */
496void nfs_handle_cb_pathdown(struct nfs_client *clp)
497{
498 if (clp == NULL)
499 return;
500 nfs_client_mark_return_all_delegations(clp);
501}
502
503static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) 499static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
504{ 500{
505 struct nfs_delegation *delegation; 501 struct nfs_delegation *delegation;
@@ -531,7 +527,7 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
531/** 527/**
532 * nfs_async_inode_return_delegation - asynchronously return a delegation 528 * nfs_async_inode_return_delegation - asynchronously return a delegation
533 * @inode: inode to process 529 * @inode: inode to process
534 * @stateid: state ID information from CB_RECALL arguments 530 * @stateid: state ID information
535 * 531 *
536 * Returns zero on success, or a negative errno value. 532 * Returns zero on success, or a negative errno value.
537 */ 533 */
@@ -545,7 +541,7 @@ int nfs_async_inode_return_delegation(struct inode *inode,
545 rcu_read_lock(); 541 rcu_read_lock();
546 delegation = rcu_dereference(NFS_I(inode)->delegation); 542 delegation = rcu_dereference(NFS_I(inode)->delegation);
547 543
548 if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { 544 if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) {
549 rcu_read_unlock(); 545 rcu_read_unlock();
550 return -ENOENT; 546 return -ENOENT;
551 } 547 }
@@ -684,21 +680,25 @@ int nfs_delegations_present(struct nfs_client *clp)
684 * nfs4_copy_delegation_stateid - Copy inode's state ID information 680 * nfs4_copy_delegation_stateid - Copy inode's state ID information
685 * @dst: stateid data structure to fill in 681 * @dst: stateid data structure to fill in
686 * @inode: inode to check 682 * @inode: inode to check
683 * @flags: delegation type requirement
687 * 684 *
688 * Returns one and fills in "dst->data" if inode had a delegation, 685 * Returns "true" and fills in "dst->data" if inode had a delegation,
689 * otherwise zero is returned. 686 * otherwise "false" is returned.
690 */ 687 */
691int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) 688bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
689 fmode_t flags)
692{ 690{
693 struct nfs_inode *nfsi = NFS_I(inode); 691 struct nfs_inode *nfsi = NFS_I(inode);
694 struct nfs_delegation *delegation; 692 struct nfs_delegation *delegation;
695 int ret = 0; 693 bool ret;
696 694
695 flags &= FMODE_READ|FMODE_WRITE;
697 rcu_read_lock(); 696 rcu_read_lock();
698 delegation = rcu_dereference(nfsi->delegation); 697 delegation = rcu_dereference(nfsi->delegation);
699 if (delegation != NULL) { 698 ret = (delegation != NULL && (delegation->type & flags) == flags);
700 memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); 699 if (ret) {
701 ret = 1; 700 nfs4_stateid_copy(dst, &delegation->stateid);
701 nfs_mark_delegation_referenced(delegation);
702 } 702 }
703 rcu_read_unlock(); 703 rcu_read_unlock();
704 return ret; 704 return ret;
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index d9322e490c56..cd6a7a8dadae 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -42,9 +42,9 @@ void nfs_super_return_all_delegations(struct super_block *sb);
42void nfs_expire_all_delegations(struct nfs_client *clp); 42void nfs_expire_all_delegations(struct nfs_client *clp);
43void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags); 43void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);
44void nfs_expire_unreferenced_delegations(struct nfs_client *clp); 44void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
45void nfs_handle_cb_pathdown(struct nfs_client *clp);
46int nfs_client_return_marked_delegations(struct nfs_client *clp); 45int nfs_client_return_marked_delegations(struct nfs_client *clp);
47int nfs_delegations_present(struct nfs_client *clp); 46int nfs_delegations_present(struct nfs_client *clp);
47void nfs_remove_bad_delegation(struct inode *inode);
48 48
49void nfs_delegation_mark_reclaim(struct nfs_client *clp); 49void nfs_delegation_mark_reclaim(struct nfs_client *clp);
50void nfs_delegation_reap_unclaimed(struct nfs_client *clp); 50void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
@@ -53,7 +53,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
53int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); 53int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
54int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); 54int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
55int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); 55int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl);
56int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode); 56bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags);
57 57
58void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); 58void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
59int nfs_have_delegation(struct inode *inode, fmode_t flags); 59int nfs_have_delegation(struct inode *inode, fmode_t flags);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 32aa6917265a..4aaf0316d76a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -207,7 +207,7 @@ struct nfs_cache_array_entry {
207}; 207};
208 208
209struct nfs_cache_array { 209struct nfs_cache_array {
210 unsigned int size; 210 int size;
211 int eof_index; 211 int eof_index;
212 u64 last_cookie; 212 u64 last_cookie;
213 struct nfs_cache_array_entry array[0]; 213 struct nfs_cache_array_entry array[0];
@@ -1429,6 +1429,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1429 } 1429 }
1430 1430
1431 open_flags = nd->intent.open.flags; 1431 open_flags = nd->intent.open.flags;
1432 attr.ia_valid = 0;
1432 1433
1433 ctx = create_nfs_open_context(dentry, open_flags); 1434 ctx = create_nfs_open_context(dentry, open_flags);
1434 res = ERR_CAST(ctx); 1435 res = ERR_CAST(ctx);
@@ -1437,11 +1438,14 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1437 1438
1438 if (nd->flags & LOOKUP_CREATE) { 1439 if (nd->flags & LOOKUP_CREATE) {
1439 attr.ia_mode = nd->intent.open.create_mode; 1440 attr.ia_mode = nd->intent.open.create_mode;
1440 attr.ia_valid = ATTR_MODE; 1441 attr.ia_valid |= ATTR_MODE;
1441 attr.ia_mode &= ~current_umask(); 1442 attr.ia_mode &= ~current_umask();
1442 } else { 1443 } else
1443 open_flags &= ~(O_EXCL | O_CREAT); 1444 open_flags &= ~(O_EXCL | O_CREAT);
1444 attr.ia_valid = 0; 1445
1446 if (open_flags & O_TRUNC) {
1447 attr.ia_valid |= ATTR_SIZE;
1448 attr.ia_size = 0;
1445 } 1449 }
1446 1450
1447 /* Open the file on the server */ 1451 /* Open the file on the server */
@@ -1495,6 +1499,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1495 struct inode *inode; 1499 struct inode *inode;
1496 struct inode *dir; 1500 struct inode *dir;
1497 struct nfs_open_context *ctx; 1501 struct nfs_open_context *ctx;
1502 struct iattr attr;
1498 int openflags, ret = 0; 1503 int openflags, ret = 0;
1499 1504
1500 if (nd->flags & LOOKUP_RCU) 1505 if (nd->flags & LOOKUP_RCU)
@@ -1523,19 +1528,27 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1523 /* We cannot do exclusive creation on a positive dentry */ 1528 /* We cannot do exclusive creation on a positive dentry */
1524 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) 1529 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
1525 goto no_open_dput; 1530 goto no_open_dput;
1526 /* We can't create new files, or truncate existing ones here */ 1531 /* We can't create new files here */
1527 openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); 1532 openflags &= ~(O_CREAT|O_EXCL);
1528 1533
1529 ctx = create_nfs_open_context(dentry, openflags); 1534 ctx = create_nfs_open_context(dentry, openflags);
1530 ret = PTR_ERR(ctx); 1535 ret = PTR_ERR(ctx);
1531 if (IS_ERR(ctx)) 1536 if (IS_ERR(ctx))
1532 goto out; 1537 goto out;
1538
1539 attr.ia_valid = 0;
1540 if (openflags & O_TRUNC) {
1541 attr.ia_valid |= ATTR_SIZE;
1542 attr.ia_size = 0;
1543 nfs_wb_all(inode);
1544 }
1545
1533 /* 1546 /*
1534 * Note: we're not holding inode->i_mutex and so may be racing with 1547 * Note: we're not holding inode->i_mutex and so may be racing with
1535 * operations that change the directory. We therefore save the 1548 * operations that change the directory. We therefore save the
1536 * change attribute *before* we do the RPC call. 1549 * change attribute *before* we do the RPC call.
1537 */ 1550 */
1538 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL); 1551 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr);
1539 if (IS_ERR(inode)) { 1552 if (IS_ERR(inode)) {
1540 ret = PTR_ERR(inode); 1553 ret = PTR_ERR(inode);
1541 switch (ret) { 1554 switch (ret) {
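
Both nfs_atomic_lookup() and nfs_open_revalidate() now fold O_TRUNC into the iattr passed to ->open_context() (ATTR_SIZE with ia_size = 0, plus an nfs_wb_all() flush in the revalidate path, which previously just stripped O_TRUNC), so the truncation is carried out with the open itself. The attribute block both paths build:

/* what O_TRUNC turns into on these paths */
struct iattr attr = {
	.ia_valid = ATTR_SIZE,
	.ia_size  = 0,		/* truncate to zero as part of the open */
};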
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index ea5be1262d41..481be7f7bdd3 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -264,9 +264,7 @@ static void nfs_direct_read_release(void *calldata)
264} 264}
265 265
266static const struct rpc_call_ops nfs_read_direct_ops = { 266static const struct rpc_call_ops nfs_read_direct_ops = {
267#if defined(CONFIG_NFS_V4_1)
268 .rpc_call_prepare = nfs_read_prepare, 267 .rpc_call_prepare = nfs_read_prepare,
269#endif /* CONFIG_NFS_V4_1 */
270 .rpc_call_done = nfs_direct_read_result, 268 .rpc_call_done = nfs_direct_read_result,
271 .rpc_release = nfs_direct_read_release, 269 .rpc_release = nfs_direct_read_release,
272}; 270};
@@ -553,9 +551,7 @@ static void nfs_direct_commit_release(void *calldata)
553} 551}
554 552
555static const struct rpc_call_ops nfs_commit_direct_ops = { 553static const struct rpc_call_ops nfs_commit_direct_ops = {
556#if defined(CONFIG_NFS_V4_1)
557 .rpc_call_prepare = nfs_write_prepare, 554 .rpc_call_prepare = nfs_write_prepare,
558#endif /* CONFIG_NFS_V4_1 */
559 .rpc_call_done = nfs_direct_commit_result, 555 .rpc_call_done = nfs_direct_commit_result,
560 .rpc_release = nfs_direct_commit_release, 556 .rpc_release = nfs_direct_commit_release,
561}; 557};
@@ -695,9 +691,7 @@ out_unlock:
695} 691}
696 692
697static const struct rpc_call_ops nfs_write_direct_ops = { 693static const struct rpc_call_ops nfs_write_direct_ops = {
698#if defined(CONFIG_NFS_V4_1)
699 .rpc_call_prepare = nfs_write_prepare, 694 .rpc_call_prepare = nfs_write_prepare,
700#endif /* CONFIG_NFS_V4_1 */
701 .rpc_call_done = nfs_direct_write_result, 695 .rpc_call_done = nfs_direct_write_result,
702 .rpc_release = nfs_direct_write_release, 696 .rpc_release = nfs_direct_write_release,
703}; 697};
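
All three rpc_call_ops tables in the direct I/O path drop their CONFIG_NFS_V4_1 guards: .rpc_call_prepare is wired unconditionally, which presumably means nfs_read_prepare()/nfs_write_prepare() fall through to a plain rpc_call_start() when no session is involved. The resulting shape of each table:

static const struct rpc_call_ops nfs_read_direct_ops = {
	.rpc_call_prepare = nfs_read_prepare,		/* no longer v4.1-only */
	.rpc_call_done    = nfs_direct_read_result,
	.rpc_release      = nfs_direct_read_release,
};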
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index a6e711ad130f..b3924b8a6000 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -10,8 +10,9 @@
10 10
11#include <linux/sunrpc/clnt.h> 11#include <linux/sunrpc/clnt.h>
12#include <linux/dns_resolver.h> 12#include <linux/dns_resolver.h>
13#include "dns_resolve.h"
13 14
14ssize_t nfs_dns_resolve_name(char *name, size_t namelen, 15ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
15 struct sockaddr *sa, size_t salen) 16 struct sockaddr *sa, size_t salen)
16{ 17{
17 ssize_t ret; 18 ssize_t ret;
@@ -20,7 +21,7 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
20 21
21 ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL); 22 ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL);
22 if (ip_len > 0) 23 if (ip_len > 0)
23 ret = rpc_pton(ip_addr, ip_len, sa, salen); 24 ret = rpc_pton(net, ip_addr, ip_len, sa, salen);
24 else 25 else
25 ret = -ESRCH; 26 ret = -ESRCH;
26 kfree(ip_addr); 27 kfree(ip_addr);
@@ -40,15 +41,15 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
40#include <linux/sunrpc/clnt.h> 41#include <linux/sunrpc/clnt.h>
41#include <linux/sunrpc/cache.h> 42#include <linux/sunrpc/cache.h>
42#include <linux/sunrpc/svcauth.h> 43#include <linux/sunrpc/svcauth.h>
44#include <linux/sunrpc/rpc_pipe_fs.h>
43 45
44#include "dns_resolve.h" 46#include "dns_resolve.h"
45#include "cache_lib.h" 47#include "cache_lib.h"
48#include "netns.h"
46 49
47#define NFS_DNS_HASHBITS 4 50#define NFS_DNS_HASHBITS 4
48#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS) 51#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS)
49 52
50static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE];
51
52struct nfs_dns_ent { 53struct nfs_dns_ent {
53 struct cache_head h; 54 struct cache_head h;
54 55
@@ -224,7 +225,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
224 len = qword_get(&buf, buf1, sizeof(buf1)); 225 len = qword_get(&buf, buf1, sizeof(buf1));
225 if (len <= 0) 226 if (len <= 0)
226 goto out; 227 goto out;
227 key.addrlen = rpc_pton(buf1, len, 228 key.addrlen = rpc_pton(cd->net, buf1, len,
228 (struct sockaddr *)&key.addr, 229 (struct sockaddr *)&key.addr,
229 sizeof(key.addr)); 230 sizeof(key.addr));
230 231
@@ -259,21 +260,6 @@ out:
259 return ret; 260 return ret;
260} 261}
261 262
262static struct cache_detail nfs_dns_resolve = {
263 .owner = THIS_MODULE,
264 .hash_size = NFS_DNS_HASHTBL_SIZE,
265 .hash_table = nfs_dns_table,
266 .name = "dns_resolve",
267 .cache_put = nfs_dns_ent_put,
268 .cache_upcall = nfs_dns_upcall,
269 .cache_parse = nfs_dns_parse,
270 .cache_show = nfs_dns_show,
271 .match = nfs_dns_match,
272 .init = nfs_dns_ent_init,
273 .update = nfs_dns_ent_update,
274 .alloc = nfs_dns_ent_alloc,
275};
276
277static int do_cache_lookup(struct cache_detail *cd, 263static int do_cache_lookup(struct cache_detail *cd,
278 struct nfs_dns_ent *key, 264 struct nfs_dns_ent *key,
279 struct nfs_dns_ent **item, 265 struct nfs_dns_ent **item,
@@ -336,8 +322,8 @@ out:
336 return ret; 322 return ret;
337} 323}
338 324
339ssize_t nfs_dns_resolve_name(char *name, size_t namelen, 325ssize_t nfs_dns_resolve_name(struct net *net, char *name,
340 struct sockaddr *sa, size_t salen) 326 size_t namelen, struct sockaddr *sa, size_t salen)
341{ 327{
342 struct nfs_dns_ent key = { 328 struct nfs_dns_ent key = {
343 .hostname = name, 329 .hostname = name,
@@ -345,28 +331,118 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
345 }; 331 };
346 struct nfs_dns_ent *item = NULL; 332 struct nfs_dns_ent *item = NULL;
347 ssize_t ret; 333 ssize_t ret;
334 struct nfs_net *nn = net_generic(net, nfs_net_id);
348 335
349 ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item); 336 ret = do_cache_lookup_wait(nn->nfs_dns_resolve, &key, &item);
350 if (ret == 0) { 337 if (ret == 0) {
351 if (salen >= item->addrlen) { 338 if (salen >= item->addrlen) {
352 memcpy(sa, &item->addr, item->addrlen); 339 memcpy(sa, &item->addr, item->addrlen);
353 ret = item->addrlen; 340 ret = item->addrlen;
354 } else 341 } else
355 ret = -EOVERFLOW; 342 ret = -EOVERFLOW;
356 cache_put(&item->h, &nfs_dns_resolve); 343 cache_put(&item->h, nn->nfs_dns_resolve);
357 } else if (ret == -ENOENT) 344 } else if (ret == -ENOENT)
358 ret = -ESRCH; 345 ret = -ESRCH;
359 return ret; 346 return ret;
360} 347}
361 348
349int nfs_dns_resolver_cache_init(struct net *net)
350{
351 int err = -ENOMEM;
352 struct nfs_net *nn = net_generic(net, nfs_net_id);
353 struct cache_detail *cd;
354 struct cache_head **tbl;
355
356 cd = kzalloc(sizeof(struct cache_detail), GFP_KERNEL);
357 if (cd == NULL)
358 goto err_cd;
359
360 tbl = kzalloc(NFS_DNS_HASHTBL_SIZE * sizeof(struct cache_head *),
361 GFP_KERNEL);
362 if (tbl == NULL)
363 goto err_tbl;
364
365 cd->owner = THIS_MODULE;
366 cd->hash_size = NFS_DNS_HASHTBL_SIZE;
367 cd->hash_table = tbl;
368 cd->name = "dns_resolve";
369 cd->cache_put = nfs_dns_ent_put;
370 cd->cache_upcall = nfs_dns_upcall;
371 cd->cache_parse = nfs_dns_parse;
372 cd->cache_show = nfs_dns_show;
373 cd->match = nfs_dns_match;
374 cd->init = nfs_dns_ent_init;
375 cd->update = nfs_dns_ent_update;
376 cd->alloc = nfs_dns_ent_alloc;
377
378 nfs_cache_init(cd);
379 err = nfs_cache_register_net(net, cd);
380 if (err)
381 goto err_reg;
382 nn->nfs_dns_resolve = cd;
383 return 0;
384
385err_reg:
386 nfs_cache_destroy(cd);
387 kfree(cd->hash_table);
388err_tbl:
389 kfree(cd);
390err_cd:
391 return err;
392}
393
394void nfs_dns_resolver_cache_destroy(struct net *net)
395{
396 struct nfs_net *nn = net_generic(net, nfs_net_id);
397 struct cache_detail *cd = nn->nfs_dns_resolve;
398
399 nfs_cache_unregister_net(net, cd);
400 nfs_cache_destroy(cd);
401 kfree(cd->hash_table);
402 kfree(cd);
403}
404
405static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
406 void *ptr)
407{
408 struct super_block *sb = ptr;
409 struct net *net = sb->s_fs_info;
410 struct nfs_net *nn = net_generic(net, nfs_net_id);
411 struct cache_detail *cd = nn->nfs_dns_resolve;
412 int ret = 0;
413
414 if (cd == NULL)
415 return 0;
416
417 if (!try_module_get(THIS_MODULE))
418 return 0;
419
420 switch (event) {
421 case RPC_PIPEFS_MOUNT:
422 ret = nfs_cache_register_sb(sb, cd);
423 break;
424 case RPC_PIPEFS_UMOUNT:
425 nfs_cache_unregister_sb(sb, cd);
426 break;
427 default:
428 ret = -ENOTSUPP;
429 break;
430 }
431 module_put(THIS_MODULE);
432 return ret;
433}
434
435static struct notifier_block nfs_dns_resolver_block = {
436 .notifier_call = rpc_pipefs_event,
437};
438
362int nfs_dns_resolver_init(void) 439int nfs_dns_resolver_init(void)
363{ 440{
364 return nfs_cache_register(&nfs_dns_resolve); 441 return rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
365} 442}
366 443
367void nfs_dns_resolver_destroy(void) 444void nfs_dns_resolver_destroy(void)
368{ 445{
369 nfs_cache_unregister(&nfs_dns_resolve); 446 rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block);
370} 447}
371
372#endif 448#endif
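
The DNS resolver cache stops being a single static cache_detail: nfs_dns_resolver_cache_init() allocates the detail and hash table per namespace and parks the pointer in nfs_net, while the module-wide init/destroy now only register and unregister an rpc_pipefs notifier so the cache's pipefs files track mounts of that superblock. Lookups then take the namespace explicitly; a usage sketch (hostname and caller are illustrative):

#include <linux/string.h>

static ssize_t example_resolve(struct net *net, struct sockaddr_storage *addr)
{
	char name[] = "server.example.com";	/* illustrative hostname */

	/* < 0 on failure: -ESRCH no record, -EOVERFLOW sockaddr buffer too small */
	return nfs_dns_resolve_name(net, name, strlen(name),
				    (struct sockaddr *)addr, sizeof(*addr));
}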
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
index 199bb5543a91..2e4f596d2923 100644
--- a/fs/nfs/dns_resolve.h
+++ b/fs/nfs/dns_resolve.h
@@ -15,12 +15,22 @@ static inline int nfs_dns_resolver_init(void)
15 15
16static inline void nfs_dns_resolver_destroy(void) 16static inline void nfs_dns_resolver_destroy(void)
17{} 17{}
18
19static inline int nfs_dns_resolver_cache_init(struct net *net)
20{
21 return 0;
22}
23
24static inline void nfs_dns_resolver_cache_destroy(struct net *net)
25{}
18#else 26#else
19extern int nfs_dns_resolver_init(void); 27extern int nfs_dns_resolver_init(void);
20extern void nfs_dns_resolver_destroy(void); 28extern void nfs_dns_resolver_destroy(void);
29extern int nfs_dns_resolver_cache_init(struct net *net);
30extern void nfs_dns_resolver_cache_destroy(struct net *net);
21#endif 31#endif
22 32
23extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen, 33extern ssize_t nfs_dns_resolve_name(struct net *net, char *name,
24 struct sockaddr *sa, size_t salen); 34 size_t namelen, struct sockaddr *sa, size_t salen);
25 35
26#endif 36#endif
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index a77a1f2da5d6..aa9b709fd328 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -529,6 +529,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
529 if (mapping != dentry->d_inode->i_mapping) 529 if (mapping != dentry->d_inode->i_mapping)
530 goto out_unlock; 530 goto out_unlock;
531 531
532 wait_on_page_writeback(page);
533
532 pagelen = nfs_page_length(page); 534 pagelen = nfs_page_length(page);
533 if (pagelen == 0) 535 if (pagelen == 0)
534 goto out_unlock; 536 goto out_unlock;
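
nfs_vm_page_mkwrite() now waits out any writeback still in flight on the page before allowing it to be redirtied; otherwise the mmap writer could modify data that an earlier WRITE is still transmitting. The ordering this enforces, sketched as a generic ->page_mkwrite:

#include <linux/mm.h>
#include <linux/pagemap.h>

static int example_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;

	lock_page(page);
	/* ... check that page->mapping still matches the file ... */
	wait_on_page_writeback(page);	/* do not touch pages with I/O outstanding */
	/* ... dirty the page ... */
	return VM_FAULT_LOCKED;		/* page is returned locked */
}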
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 419119c371bf..ae65c16b3670 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -327,7 +327,7 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)
327{ 327{
328 struct nfs_inode *nfsi = NFS_I(inode); 328 struct nfs_inode *nfsi = NFS_I(inode);
329 struct nfs_server *nfss = NFS_SERVER(inode); 329 struct nfs_server *nfss = NFS_SERVER(inode);
330 struct fscache_cookie *old = nfsi->fscache; 330 NFS_IFDEBUG(struct fscache_cookie *old = nfsi->fscache);
331 331
332 nfs_fscache_inode_lock(inode); 332 nfs_fscache_inode_lock(inode);
333 if (nfsi->fscache) { 333 if (nfsi->fscache) {
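
The cookie pointer is only consumed by debug printout, so its declaration is wrapped in NFS_IFDEBUG to avoid a set-but-unused variable warning in non-debug builds. The macro presumably expands along these lines:

/* sketch of the NFS debug helper this relies on */
#ifdef NFS_DEBUG
# define NFS_IFDEBUG(x)		x
#else
# define NFS_IFDEBUG(x)
#endif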
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index a1bbf7780dfc..b7f348bb618b 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -34,11 +34,29 @@
34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 */ 35 */
36#include <linux/types.h> 36#include <linux/types.h>
37#include <linux/string.h> 37#include <linux/parser.h>
38#include <linux/kernel.h> 38#include <linux/fs.h>
39#include <linux/slab.h>
40#include <linux/nfs_idmap.h> 39#include <linux/nfs_idmap.h>
40#include <net/net_namespace.h>
41#include <linux/sunrpc/rpc_pipe_fs.h>
41#include <linux/nfs_fs.h> 42#include <linux/nfs_fs.h>
43#include <linux/nfs_fs_sb.h>
44#include <linux/key.h>
45#include <linux/keyctl.h>
46#include <linux/key-type.h>
47#include <keys/user-type.h>
48#include <linux/module.h>
49
50#include "internal.h"
51#include "netns.h"
52
53#define NFS_UINT_MAXLEN 11
54
55/* Default cache timeout is 10 minutes */
56unsigned int nfs_idmap_cache_timeout = 600;
57static const struct cred *id_resolver_cache;
58static struct key_type key_type_id_resolver_legacy;
59
42 60
43/** 61/**
44 * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields 62 * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
@@ -142,24 +160,7 @@ static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
142 return snprintf(buf, buflen, "%u", id); 160 return snprintf(buf, buflen, "%u", id);
143} 161}
144 162
145#ifdef CONFIG_NFS_USE_NEW_IDMAPPER 163static struct key_type key_type_id_resolver = {
146
147#include <linux/cred.h>
148#include <linux/sunrpc/sched.h>
149#include <linux/nfs4.h>
150#include <linux/nfs_fs_sb.h>
151#include <linux/keyctl.h>
152#include <linux/key-type.h>
153#include <linux/rcupdate.h>
154#include <linux/err.h>
155
156#include <keys/user-type.h>
157
158#define NFS_UINT_MAXLEN 11
159
160const struct cred *id_resolver_cache;
161
162struct key_type key_type_id_resolver = {
163 .name = "id_resolver", 164 .name = "id_resolver",
164 .instantiate = user_instantiate, 165 .instantiate = user_instantiate,
165 .match = user_match, 166 .match = user_match,
@@ -169,13 +170,14 @@ struct key_type key_type_id_resolver = {
169 .read = user_read, 170 .read = user_read,
170}; 171};
171 172
172int nfs_idmap_init(void) 173static int nfs_idmap_init_keyring(void)
173{ 174{
174 struct cred *cred; 175 struct cred *cred;
175 struct key *keyring; 176 struct key *keyring;
176 int ret = 0; 177 int ret = 0;
177 178
178 printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name); 179 printk(KERN_NOTICE "NFS: Registering the %s key type\n",
180 key_type_id_resolver.name);
179 181
180 cred = prepare_kernel_cred(NULL); 182 cred = prepare_kernel_cred(NULL);
181 if (!cred) 183 if (!cred)
@@ -211,7 +213,7 @@ failed_put_cred:
211 return ret; 213 return ret;
212} 214}
213 215
214void nfs_idmap_quit(void) 216static void nfs_idmap_quit_keyring(void)
215{ 217{
216 key_revoke(id_resolver_cache->thread_keyring); 218 key_revoke(id_resolver_cache->thread_keyring);
217 unregister_key_type(&key_type_id_resolver); 219 unregister_key_type(&key_type_id_resolver);
@@ -246,8 +248,10 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
246 return desclen; 248 return desclen;
247} 249}
248 250
249static ssize_t nfs_idmap_request_key(const char *name, size_t namelen, 251static ssize_t nfs_idmap_request_key(struct key_type *key_type,
250 const char *type, void *data, size_t data_size) 252 const char *name, size_t namelen,
253 const char *type, void *data,
254 size_t data_size, struct idmap *idmap)
251{ 255{
252 const struct cred *saved_cred; 256 const struct cred *saved_cred;
253 struct key *rkey; 257 struct key *rkey;
@@ -260,8 +264,12 @@ static ssize_t nfs_idmap_request_key(const char *name, size_t namelen,
260 goto out; 264 goto out;
261 265
262 saved_cred = override_creds(id_resolver_cache); 266 saved_cred = override_creds(id_resolver_cache);
263 rkey = request_key(&key_type_id_resolver, desc, ""); 267 if (idmap)
268 rkey = request_key_with_auxdata(key_type, desc, "", 0, idmap);
269 else
270 rkey = request_key(&key_type_id_resolver, desc, "");
264 revert_creds(saved_cred); 271 revert_creds(saved_cred);
272
265 kfree(desc); 273 kfree(desc);
266 if (IS_ERR(rkey)) { 274 if (IS_ERR(rkey)) {
267 ret = PTR_ERR(rkey); 275 ret = PTR_ERR(rkey);
@@ -294,31 +302,46 @@ out:
294 return ret; 302 return ret;
295} 303}
296 304
305static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
306 const char *type, void *data,
307 size_t data_size, struct idmap *idmap)
308{
309 ssize_t ret = nfs_idmap_request_key(&key_type_id_resolver,
310 name, namelen, type, data,
311 data_size, NULL);
312 if (ret < 0) {
313 ret = nfs_idmap_request_key(&key_type_id_resolver_legacy,
314 name, namelen, type, data,
315 data_size, idmap);
316 }
317 return ret;
318}
297 319
298/* ID -> Name */ 320/* ID -> Name */
299static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen) 321static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf,
322 size_t buflen, struct idmap *idmap)
300{ 323{
301 char id_str[NFS_UINT_MAXLEN]; 324 char id_str[NFS_UINT_MAXLEN];
302 int id_len; 325 int id_len;
303 ssize_t ret; 326 ssize_t ret;
304 327
305 id_len = snprintf(id_str, sizeof(id_str), "%u", id); 328 id_len = snprintf(id_str, sizeof(id_str), "%u", id);
306 ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen); 329 ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap);
307 if (ret < 0) 330 if (ret < 0)
308 return -EINVAL; 331 return -EINVAL;
309 return ret; 332 return ret;
310} 333}
311 334
312/* Name -> ID */ 335/* Name -> ID */
313static int nfs_idmap_lookup_id(const char *name, size_t namelen, 336static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *type,
314 const char *type, __u32 *id) 337 __u32 *id, struct idmap *idmap)
315{ 338{
316 char id_str[NFS_UINT_MAXLEN]; 339 char id_str[NFS_UINT_MAXLEN];
317 long id_long; 340 long id_long;
318 ssize_t data_size; 341 ssize_t data_size;
319 int ret = 0; 342 int ret = 0;
320 343
321 data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN); 344 data_size = nfs_idmap_get_key(name, namelen, type, id_str, NFS_UINT_MAXLEN, idmap);
322 if (data_size <= 0) { 345 if (data_size <= 0) {
323 ret = -EINVAL; 346 ret = -EINVAL;
324 } else { 347 } else {
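
nfs_idmap_get_key() tries the plain id_resolver key type first (served from userspace by nfs.idmap through /sbin/request-key) and falls back to the legacy key type, whose ->request_key hook raises the old rpc_pipefs upcall to rpc.idmapd. The first path needs a request-key rule installed; a typical /etc/request-key.conf entry (the program path and timeout are site configuration, not taken from this diff):

#OP	TYPE		DESCRIPTION	CALLOUT INFO	PROGRAM ARG1 ARG2 ...
create	id_resolver	*		*		/usr/sbin/nfs.idmap %k %t 600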
@@ -328,114 +351,103 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen,
328 return ret; 351 return ret;
329} 352}
330 353
331int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) 354/* idmap classic begins here */
332{ 355module_param(nfs_idmap_cache_timeout, int, 0644);
333 if (nfs_map_string_to_numeric(name, namelen, uid))
334 return 0;
335 return nfs_idmap_lookup_id(name, namelen, "uid", uid);
336}
337
338int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
339{
340 if (nfs_map_string_to_numeric(name, namelen, gid))
341 return 0;
342 return nfs_idmap_lookup_id(name, namelen, "gid", gid);
343}
344
345int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
346{
347 int ret = -EINVAL;
348
349 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
350 ret = nfs_idmap_lookup_name(uid, "user", buf, buflen);
351 if (ret < 0)
352 ret = nfs_map_numeric_to_string(uid, buf, buflen);
353 return ret;
354}
355int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
356{
357 int ret = -EINVAL;
358 356
359 if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) 357struct idmap {
360 ret = nfs_idmap_lookup_name(gid, "group", buf, buflen); 358 struct rpc_pipe *idmap_pipe;
361 if (ret < 0) 359 struct key_construction *idmap_key_cons;
362 ret = nfs_map_numeric_to_string(gid, buf, buflen);
363 return ret;
364}
365
366#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
367
368#include <linux/module.h>
369#include <linux/mutex.h>
370#include <linux/init.h>
371#include <linux/socket.h>
372#include <linux/in.h>
373#include <linux/sched.h>
374#include <linux/sunrpc/clnt.h>
375#include <linux/workqueue.h>
376#include <linux/sunrpc/rpc_pipe_fs.h>
377
378#include <linux/nfs_fs.h>
379
380#include "nfs4_fs.h"
381
382#define IDMAP_HASH_SZ 128
383
384/* Default cache timeout is 10 minutes */
385unsigned int nfs_idmap_cache_timeout = 600 * HZ;
386
387static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
388{
389 char *endp;
390 int num = simple_strtol(val, &endp, 0);
391 int jif = num * HZ;
392 if (endp == val || *endp || num < 0 || jif < num)
393 return -EINVAL;
394 *((int *)kp->arg) = jif;
395 return 0;
396}
397
398module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
399 &nfs_idmap_cache_timeout, 0644);
400
401struct idmap_hashent {
402 unsigned long ih_expires;
403 __u32 ih_id;
404 size_t ih_namelen;
405 char ih_name[IDMAP_NAMESZ];
406}; 360};
407 361
408struct idmap_hashtable { 362enum {
409 __u8 h_type; 363 Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err
410 struct idmap_hashent h_entries[IDMAP_HASH_SZ];
411}; 364};
412 365
413struct idmap { 366static const match_table_t nfs_idmap_tokens = {
414 struct dentry *idmap_dentry; 367 { Opt_find_uid, "uid:%s" },
415 wait_queue_head_t idmap_wq; 368 { Opt_find_gid, "gid:%s" },
416 struct idmap_msg idmap_im; 369 { Opt_find_user, "user:%s" },
417 struct mutex idmap_lock; /* Serializes upcalls */ 370 { Opt_find_group, "group:%s" },
418 struct mutex idmap_im_lock; /* Protects the hashtable */ 371 { Opt_find_err, NULL }
419 struct idmap_hashtable idmap_user_hash;
420 struct idmap_hashtable idmap_group_hash;
421}; 372};
422 373
374static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *);
423static ssize_t idmap_pipe_downcall(struct file *, const char __user *, 375static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
424 size_t); 376 size_t);
425static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); 377static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
426 378
427static unsigned int fnvhash32(const void *, size_t);
428
429static const struct rpc_pipe_ops idmap_upcall_ops = { 379static const struct rpc_pipe_ops idmap_upcall_ops = {
430 .upcall = rpc_pipe_generic_upcall, 380 .upcall = rpc_pipe_generic_upcall,
431 .downcall = idmap_pipe_downcall, 381 .downcall = idmap_pipe_downcall,
432 .destroy_msg = idmap_pipe_destroy_msg, 382 .destroy_msg = idmap_pipe_destroy_msg,
433}; 383};
434 384
385static struct key_type key_type_id_resolver_legacy = {
386 .name = "id_resolver",
387 .instantiate = user_instantiate,
388 .match = user_match,
389 .revoke = user_revoke,
390 .destroy = user_destroy,
391 .describe = user_describe,
392 .read = user_read,
393 .request_key = nfs_idmap_legacy_upcall,
394};
395
396static void __nfs_idmap_unregister(struct rpc_pipe *pipe)
397{
398 if (pipe->dentry)
399 rpc_unlink(pipe->dentry);
400}
401
402static int __nfs_idmap_register(struct dentry *dir,
403 struct idmap *idmap,
404 struct rpc_pipe *pipe)
405{
406 struct dentry *dentry;
407
408 dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe);
409 if (IS_ERR(dentry))
410 return PTR_ERR(dentry);
411 pipe->dentry = dentry;
412 return 0;
413}
414
415static void nfs_idmap_unregister(struct nfs_client *clp,
416 struct rpc_pipe *pipe)
417{
418 struct net *net = clp->net;
419 struct super_block *pipefs_sb;
420
421 pipefs_sb = rpc_get_sb_net(net);
422 if (pipefs_sb) {
423 __nfs_idmap_unregister(pipe);
424 rpc_put_sb_net(net);
425 }
426}
427
428static int nfs_idmap_register(struct nfs_client *clp,
429 struct idmap *idmap,
430 struct rpc_pipe *pipe)
431{
432 struct net *net = clp->net;
433 struct super_block *pipefs_sb;
434 int err = 0;
435
436 pipefs_sb = rpc_get_sb_net(net);
437 if (pipefs_sb) {
438 if (clp->cl_rpcclient->cl_dentry)
439 err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry,
440 idmap, pipe);
441 rpc_put_sb_net(net);
442 }
443 return err;
444}
445
435int 446int
436nfs_idmap_new(struct nfs_client *clp) 447nfs_idmap_new(struct nfs_client *clp)
437{ 448{
438 struct idmap *idmap; 449 struct idmap *idmap;
450 struct rpc_pipe *pipe;
439 int error; 451 int error;
440 452
441 BUG_ON(clp->cl_idmap != NULL); 453 BUG_ON(clp->cl_idmap != NULL);
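
Buried in the hunk above is a module parameter change: the old code exposed idmap_cache_timeout through a custom setter that converted seconds to jiffies on store, while the new code exports nfs_idmap_cache_timeout in plain seconds (default 600), since the value is now handed to the keys facility, which expects seconds. The visible option name changes with it, so a modprobe configuration would need updating (illustrative):

# /etc/modprobe.d/nfs.conf (illustrative)
# before: options nfs idmap_cache_timeout=1200
# after:  options nfs nfs_idmap_cache_timeout=1200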
@@ -444,19 +456,19 @@ nfs_idmap_new(struct nfs_client *clp)
444 if (idmap == NULL) 456 if (idmap == NULL)
445 return -ENOMEM; 457 return -ENOMEM;
446 458
447 idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry, 459 pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0);
448 "idmap", idmap, &idmap_upcall_ops, 0); 460 if (IS_ERR(pipe)) {
449 if (IS_ERR(idmap->idmap_dentry)) { 461 error = PTR_ERR(pipe);
450 error = PTR_ERR(idmap->idmap_dentry);
451 kfree(idmap); 462 kfree(idmap);
452 return error; 463 return error;
453 } 464 }
454 465 error = nfs_idmap_register(clp, idmap, pipe);
455 mutex_init(&idmap->idmap_lock); 466 if (error) {
456 mutex_init(&idmap->idmap_im_lock); 467 rpc_destroy_pipe_data(pipe);
457 init_waitqueue_head(&idmap->idmap_wq); 468 kfree(idmap);
458 idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER; 469 return error;
459 idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP; 470 }
471 idmap->idmap_pipe = pipe;
460 472
461 clp->cl_idmap = idmap; 473 clp->cl_idmap = idmap;
462 return 0; 474 return 0;
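
nfs_idmap_new() now splits the pipe into data and dentry: rpc_mkpipe_data() allocates state that lives as long as the nfs_client, while nfs_idmap_register() creates the pipefs dentry only if the pipefs superblock is currently mounted; the notifier below re-creates it on RPC_PIPEFS_MOUNT. Condensed from the surrounding hunks, the lifetime pairing is:

/* condensed from nfs_idmap_new()/nfs_idmap_delete() above and below */
pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0);	/* lives with the nfs_client */
error = nfs_idmap_register(clp, idmap, pipe);	/* dentry exists only while pipefs is mounted */
/* ... */
nfs_idmap_unregister(clp, idmap->idmap_pipe);	/* rpc_unlink() the dentry, if any */
rpc_destroy_pipe_data(idmap->idmap_pipe);	/* then release the pipe data */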
@@ -469,211 +481,220 @@ nfs_idmap_delete(struct nfs_client *clp)
469 481
470 if (!idmap) 482 if (!idmap)
471 return; 483 return;
472 rpc_unlink(idmap->idmap_dentry); 484 nfs_idmap_unregister(clp, idmap->idmap_pipe);
485 rpc_destroy_pipe_data(idmap->idmap_pipe);
473 clp->cl_idmap = NULL; 486 clp->cl_idmap = NULL;
474 kfree(idmap); 487 kfree(idmap);
475} 488}
476 489
477/* 490static int __rpc_pipefs_event(struct nfs_client *clp, unsigned long event,
478 * Helper routines for manipulating the hashtable 491 struct super_block *sb)
479 */
480static inline struct idmap_hashent *
481idmap_name_hash(struct idmap_hashtable* h, const char *name, size_t len)
482{
483 return &h->h_entries[fnvhash32(name, len) % IDMAP_HASH_SZ];
484}
485
486static struct idmap_hashent *
487idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len)
488{ 492{
489 struct idmap_hashent *he = idmap_name_hash(h, name, len); 493 int err = 0;
490 494
491 if (he->ih_namelen != len || memcmp(he->ih_name, name, len) != 0) 495 switch (event) {
492 return NULL; 496 case RPC_PIPEFS_MOUNT:
493 if (time_after(jiffies, he->ih_expires)) 497 BUG_ON(clp->cl_rpcclient->cl_dentry == NULL);
494 return NULL; 498 err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry,
495 return he; 499 clp->cl_idmap,
500 clp->cl_idmap->idmap_pipe);
501 break;
502 case RPC_PIPEFS_UMOUNT:
503 if (clp->cl_idmap->idmap_pipe) {
504 struct dentry *parent;
505
506 parent = clp->cl_idmap->idmap_pipe->dentry->d_parent;
507 __nfs_idmap_unregister(clp->cl_idmap->idmap_pipe);
508 /*
509 * Note: This is a dirty hack. SUNRPC hook has been
510 * called already but simple_rmdir() call for the
511 * directory returned with error because of idmap pipe
512 * inside. Thus now we have to remove this directory
513 * here.
514 */
515 if (rpc_rmdir(parent))
516 printk(KERN_ERR "NFS: %s: failed to remove "
517 "clnt dir!\n", __func__);
518 }
519 break;
520 default:
521 printk(KERN_ERR "NFS: %s: unknown event: %ld\n", __func__,
522 event);
523 return -ENOTSUPP;
524 }
525 return err;
526}
527
528static struct nfs_client *nfs_get_client_for_event(struct net *net, int event)
529{
530 struct nfs_net *nn = net_generic(net, nfs_net_id);
531 struct dentry *cl_dentry;
532 struct nfs_client *clp;
533
534 spin_lock(&nn->nfs_client_lock);
535 list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
536 if (clp->rpc_ops != &nfs_v4_clientops)
537 continue;
538 cl_dentry = clp->cl_idmap->idmap_pipe->dentry;
539 if (((event == RPC_PIPEFS_MOUNT) && cl_dentry) ||
540 ((event == RPC_PIPEFS_UMOUNT) && !cl_dentry))
541 continue;
542 atomic_inc(&clp->cl_count);
543 spin_unlock(&nn->nfs_client_lock);
544 return clp;
545 }
546 spin_unlock(&nn->nfs_client_lock);
547 return NULL;
496} 548}
497 549
498static inline struct idmap_hashent * 550static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
499idmap_id_hash(struct idmap_hashtable* h, __u32 id) 551 void *ptr)
500{ 552{
501 return &h->h_entries[fnvhash32(&id, sizeof(id)) % IDMAP_HASH_SZ]; 553 struct super_block *sb = ptr;
502} 554 struct nfs_client *clp;
555 int error = 0;
503 556
504static struct idmap_hashent * 557 while ((clp = nfs_get_client_for_event(sb->s_fs_info, event))) {
505idmap_lookup_id(struct idmap_hashtable *h, __u32 id) 558 error = __rpc_pipefs_event(clp, event, sb);
506{ 559 nfs_put_client(clp);
507 struct idmap_hashent *he = idmap_id_hash(h, id); 560 if (error)
508 if (he->ih_id != id || he->ih_namelen == 0) 561 break;
509 return NULL; 562 }
510 if (time_after(jiffies, he->ih_expires)) 563 return error;
511 return NULL;
512 return he;
513} 564}
514 565
515/* 566#define PIPEFS_NFS_PRIO 1
516 * Routines for allocating new entries in the hashtable. 567
517 * For now, we just have 1 entry per bucket, so it's all 568static struct notifier_block nfs_idmap_block = {
518 * pretty trivial. 569 .notifier_call = rpc_pipefs_event,
519 */ 570 .priority = SUNRPC_PIPEFS_NFS_PRIO,
520static inline struct idmap_hashent * 571};
521idmap_alloc_name(struct idmap_hashtable *h, char *name, size_t len)
522{
523 return idmap_name_hash(h, name, len);
524}
525 572
526static inline struct idmap_hashent * 573int nfs_idmap_init(void)
527idmap_alloc_id(struct idmap_hashtable *h, __u32 id)
528{ 574{
529 return idmap_id_hash(h, id); 575 int ret;
576 ret = nfs_idmap_init_keyring();
577 if (ret != 0)
578 goto out;
579 ret = rpc_pipefs_notifier_register(&nfs_idmap_block);
580 if (ret != 0)
581 nfs_idmap_quit_keyring();
582out:
583 return ret;
530} 584}
531 585
532static void 586void nfs_idmap_quit(void)
533idmap_update_entry(struct idmap_hashent *he, const char *name,
534 size_t namelen, __u32 id)
535{ 587{
536 he->ih_id = id; 588 rpc_pipefs_notifier_unregister(&nfs_idmap_block);
537 memcpy(he->ih_name, name, namelen); 589 nfs_idmap_quit_keyring();
538 he->ih_name[namelen] = '\0';
539 he->ih_namelen = namelen;
540 he->ih_expires = jiffies + nfs_idmap_cache_timeout;
541} 590}
542 591
543/* 592static int nfs_idmap_prepare_message(char *desc, struct idmap_msg *im,
544 * Name -> ID 593 struct rpc_pipe_msg *msg)
545 */
546static int
547nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h,
548 const char *name, size_t namelen, __u32 *id)
549{ 594{
550 struct rpc_pipe_msg msg; 595 substring_t substr;
551 struct idmap_msg *im; 596 int token, ret;
552 struct idmap_hashent *he;
553 DECLARE_WAITQUEUE(wq, current);
554 int ret = -EIO;
555
556 im = &idmap->idmap_im;
557
558 /*
559 * String sanity checks
560 * Note that the userland daemon expects NUL terminated strings
561 */
562 for (;;) {
563 if (namelen == 0)
564 return -EINVAL;
565 if (name[namelen-1] != '\0')
566 break;
567 namelen--;
568 }
569 if (namelen >= IDMAP_NAMESZ)
570 return -EINVAL;
571 597
572 mutex_lock(&idmap->idmap_lock); 598 memset(im, 0, sizeof(*im));
573 mutex_lock(&idmap->idmap_im_lock); 599 memset(msg, 0, sizeof(*msg));
574
575 he = idmap_lookup_name(h, name, namelen);
576 if (he != NULL) {
577 *id = he->ih_id;
578 ret = 0;
579 goto out;
580 }
581 600
582 memset(im, 0, sizeof(*im)); 601 im->im_type = IDMAP_TYPE_GROUP;
583 memcpy(im->im_name, name, namelen); 602 token = match_token(desc, nfs_idmap_tokens, &substr);
584 603
585 im->im_type = h->h_type; 604 switch (token) {
586 im->im_conv = IDMAP_CONV_NAMETOID; 605 case Opt_find_uid:
606 im->im_type = IDMAP_TYPE_USER;
607 case Opt_find_gid:
608 im->im_conv = IDMAP_CONV_NAMETOID;
609 ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ);
610 break;
587 611
588 memset(&msg, 0, sizeof(msg)); 612 case Opt_find_user:
589 msg.data = im; 613 im->im_type = IDMAP_TYPE_USER;
590 msg.len = sizeof(*im); 614 case Opt_find_group:
615 im->im_conv = IDMAP_CONV_IDTONAME;
616 ret = match_int(&substr, &im->im_id);
617 break;
591 618
592 add_wait_queue(&idmap->idmap_wq, &wq); 619 default:
593 if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { 620 ret = -EINVAL;
594 remove_wait_queue(&idmap->idmap_wq, &wq);
595 goto out; 621 goto out;
596 } 622 }
597 623
598 set_current_state(TASK_UNINTERRUPTIBLE); 624 msg->data = im;
599 mutex_unlock(&idmap->idmap_im_lock); 625 msg->len = sizeof(struct idmap_msg);
600 schedule();
601 __set_current_state(TASK_RUNNING);
602 remove_wait_queue(&idmap->idmap_wq, &wq);
603 mutex_lock(&idmap->idmap_im_lock);
604 626
605 if (im->im_status & IDMAP_STATUS_SUCCESS) { 627out:
606 *id = im->im_id;
607 ret = 0;
608 }
609
610 out:
611 memset(im, 0, sizeof(*im));
612 mutex_unlock(&idmap->idmap_im_lock);
613 mutex_unlock(&idmap->idmap_lock);
614 return ret; 628 return ret;
615} 629}
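
The switch above relies on deliberate fall-through: Opt_find_uid sets im_type and then shares Opt_find_gid's NAMETOID branch, and Opt_find_user likewise shares Opt_find_group's IDTONAME branch. The token table being matched lives in an earlier hunk of idmap.c; by assumption it has roughly this shape, which also shows what the key descriptions look like (the example values are illustrative only):

	/* Assumed shape of nfs_idmap_tokens (defined in an earlier hunk). */
	static const match_table_t nfs_idmap_tokens = {
		{ Opt_find_uid,   "uid:%s"   },	/* e.g. "uid:bob"   -> name-to-id */
		{ Opt_find_gid,   "gid:%s"   },	/* e.g. "gid:staff" -> name-to-id */
		{ Opt_find_user,  "user:%d"  },	/* e.g. "user:1000" -> id-to-name */
		{ Opt_find_group, "group:%d" },	/* e.g. "group:50"  -> id-to-name */
		{ Opt_find_err,   NULL       }
	};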
616 630
617/* 631static int nfs_idmap_legacy_upcall(struct key_construction *cons,
618 * ID -> Name 632 const char *op,
619 */ 633 void *aux)
620static int
621nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
622 __u32 id, char *name)
623{ 634{
624 struct rpc_pipe_msg msg; 635 struct rpc_pipe_msg *msg;
625 struct idmap_msg *im; 636 struct idmap_msg *im;
626 struct idmap_hashent *he; 637 struct idmap *idmap = (struct idmap *)aux;
627 DECLARE_WAITQUEUE(wq, current); 638 struct key *key = cons->key;
628 int ret = -EIO; 639 int ret;
629 unsigned int len;
630
631 im = &idmap->idmap_im;
632 640
633 mutex_lock(&idmap->idmap_lock); 641 /* msg and im are freed in idmap_pipe_destroy_msg */
634 mutex_lock(&idmap->idmap_im_lock); 642 msg = kmalloc(sizeof(*msg), GFP_KERNEL);
643 if (msg == NULL) {
644 ret = -ENOMEM;
645 goto out0;
646 }
635 647
636 he = idmap_lookup_id(h, id); 648 im = kmalloc(sizeof(*im), GFP_KERNEL);
637 if (he) { 649 if (im == NULL) {
638 memcpy(name, he->ih_name, he->ih_namelen); 650 ret = -ENOMEM;
639 ret = he->ih_namelen; 651 goto out1;
640 goto out;
641 } 652 }
642 653
643 memset(im, 0, sizeof(*im)); 654 ret = nfs_idmap_prepare_message(key->description, im, msg);
644 im->im_type = h->h_type; 655 if (ret < 0)
645 im->im_conv = IDMAP_CONV_IDTONAME; 656 goto out2;
646 im->im_id = id;
647 657
648 memset(&msg, 0, sizeof(msg)); 658 idmap->idmap_key_cons = cons;
649 msg.data = im;
650 msg.len = sizeof(*im);
651 659
652 add_wait_queue(&idmap->idmap_wq, &wq); 660 ret = rpc_queue_upcall(idmap->idmap_pipe, msg);
661 if (ret < 0)
662 goto out2;
653 663
654 if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { 664 return ret;
655 remove_wait_queue(&idmap->idmap_wq, &wq); 665
656 goto out; 666out2:
657 } 667 kfree(im);
668out1:
669 kfree(msg);
670out0:
671 key_revoke(cons->key);
672 key_revoke(cons->authkey);
673 return ret;
674}
675
676static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data)
677{
678 return key_instantiate_and_link(key, data, strlen(data) + 1,
679 id_resolver_cache->thread_keyring,
680 authkey);
681}
658 682
659 set_current_state(TASK_UNINTERRUPTIBLE); 683static int nfs_idmap_read_message(struct idmap_msg *im, struct key *key, struct key *authkey)
660 mutex_unlock(&idmap->idmap_im_lock); 684{
661 schedule(); 685 char id_str[NFS_UINT_MAXLEN];
662 __set_current_state(TASK_RUNNING); 686 int ret = -EINVAL;
663 remove_wait_queue(&idmap->idmap_wq, &wq); 687
664 mutex_lock(&idmap->idmap_im_lock); 688 switch (im->im_conv) {
665 689 case IDMAP_CONV_NAMETOID:
666 if (im->im_status & IDMAP_STATUS_SUCCESS) { 690 sprintf(id_str, "%d", im->im_id);
667 if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0) 691 ret = nfs_idmap_instantiate(key, authkey, id_str);
668 goto out; 692 break;
669 memcpy(name, im->im_name, len); 693 case IDMAP_CONV_IDTONAME:
670 ret = len; 694 ret = nfs_idmap_instantiate(key, authkey, im->im_name);
695 break;
671 } 696 }
672 697
673 out:
674 memset(im, 0, sizeof(*im));
675 mutex_unlock(&idmap->idmap_im_lock);
676 mutex_unlock(&idmap->idmap_lock);
677 return ret; 698 return ret;
678} 699}
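
After nfs_idmap_instantiate() runs, the key payload is a NUL-terminated string: a decimal id for NAMETOID replies, or the name itself for IDTONAME. A hedged sketch of how a consumer would parse the numeric case (the real consumer, nfs_idmap_lookup_id(), is referenced later in this patch but its body is outside this hunk):

	/* Plausible parse of a NAMETOID payload; not the actual helper. */
	static int example_parse_id(const char *payload, __u32 *id)
	{
		unsigned long val;

		if (kstrtoul(payload, 10, &val) != 0 || val > 0xffffffffUL)
			return -EINVAL;
		*id = (__u32)val;
		return 0;
	}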
679 700
@@ -682,115 +703,51 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
682{ 703{
683 struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode); 704 struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode);
684 struct idmap *idmap = (struct idmap *)rpci->private; 705 struct idmap *idmap = (struct idmap *)rpci->private;
685 struct idmap_msg im_in, *im = &idmap->idmap_im; 706 struct key_construction *cons = idmap->idmap_key_cons;
686 struct idmap_hashtable *h; 707 struct idmap_msg im;
687 struct idmap_hashent *he = NULL;
688 size_t namelen_in; 708 size_t namelen_in;
689 int ret; 709 int ret;
690 710
691 if (mlen != sizeof(im_in)) 711 if (mlen != sizeof(im)) {
692 return -ENOSPC; 712 ret = -ENOSPC;
693
694 if (copy_from_user(&im_in, src, mlen) != 0)
695 return -EFAULT;
696
697 mutex_lock(&idmap->idmap_im_lock);
698
699 ret = mlen;
700 im->im_status = im_in.im_status;
701 /* If we got an error, terminate now, and wake up pending upcalls */
702 if (!(im_in.im_status & IDMAP_STATUS_SUCCESS)) {
703 wake_up(&idmap->idmap_wq);
704 goto out; 713 goto out;
705 } 714 }
706 715
707 /* Sanity checking of strings */ 716 if (copy_from_user(&im, src, mlen) != 0) {
708 ret = -EINVAL; 717 ret = -EFAULT;
709 namelen_in = strnlen(im_in.im_name, IDMAP_NAMESZ);
710 if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ)
711 goto out; 718 goto out;
719 }
712 720
713 switch (im_in.im_type) { 721 if (!(im.im_status & IDMAP_STATUS_SUCCESS)) {
714 case IDMAP_TYPE_USER: 722 ret = mlen;
715 h = &idmap->idmap_user_hash; 723 complete_request_key(idmap->idmap_key_cons, -ENOKEY);
716 break; 724 goto out_incomplete;
717 case IDMAP_TYPE_GROUP:
718 h = &idmap->idmap_group_hash;
719 break;
720 default:
721 goto out;
722 } 725 }
723 726
724 switch (im_in.im_conv) { 727 namelen_in = strnlen(im.im_name, IDMAP_NAMESZ);
725 case IDMAP_CONV_IDTONAME: 728 if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) {
726 /* Did we match the current upcall? */ 729 ret = -EINVAL;
727 if (im->im_conv == IDMAP_CONV_IDTONAME
728 && im->im_type == im_in.im_type
729 && im->im_id == im_in.im_id) {
730 /* Yes: copy string, including the terminating '\0' */
731 memcpy(im->im_name, im_in.im_name, namelen_in);
732 im->im_name[namelen_in] = '\0';
733 wake_up(&idmap->idmap_wq);
734 }
735 he = idmap_alloc_id(h, im_in.im_id);
736 break;
737 case IDMAP_CONV_NAMETOID:
738 /* Did we match the current upcall? */
739 if (im->im_conv == IDMAP_CONV_NAMETOID
740 && im->im_type == im_in.im_type
741 && strnlen(im->im_name, IDMAP_NAMESZ) == namelen_in
742 && memcmp(im->im_name, im_in.im_name, namelen_in) == 0) {
743 im->im_id = im_in.im_id;
744 wake_up(&idmap->idmap_wq);
745 }
746 he = idmap_alloc_name(h, im_in.im_name, namelen_in);
747 break;
748 default:
749 goto out; 730 goto out;
750 } 731 }
751 732
752 /* If the entry is valid, also copy it to the cache */ 733 ret = nfs_idmap_read_message(&im, cons->key, cons->authkey);
753 if (he != NULL) 734 if (ret >= 0) {
754 idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id); 735 key_set_timeout(cons->key, nfs_idmap_cache_timeout);
755 ret = mlen; 736 ret = mlen;
737 }
738
756out: 739out:
757 mutex_unlock(&idmap->idmap_im_lock); 740 complete_request_key(idmap->idmap_key_cons, ret);
741out_incomplete:
758 return ret; 742 return ret;
759} 743}
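
For reference, the downcall above accepts only a single write of exactly sizeof(struct idmap_msg). A userspace sketch of a successful NAMETOID reply follows; the struct layout and the status constant are assumptions copied from the nfs_idmap.h of this era (rpc.idmapd historically carries its own copy), so verify them before relying on this:

	#include <unistd.h>

	#define IDMAP_NAMESZ		128
	#define IDMAP_STATUS_SUCCESS	0x08	/* assumed value */

	struct idmap_msg {			/* assumed layout */
		unsigned char	im_type;
		unsigned char	im_conv;
		char		im_name[IDMAP_NAMESZ];
		unsigned int	im_id;
		unsigned char	im_status;
	};

	static int reply_nametoid(int pipe_fd, struct idmap_msg im,
				  unsigned int resolved_id)
	{
		im.im_id = resolved_id;		/* echo name/type/conv back as-is */
		im.im_status = IDMAP_STATUS_SUCCESS;
		/* partial writes are rejected by the kernel with -ENOSPC */
		return write(pipe_fd, &im, sizeof(im)) == (ssize_t)sizeof(im) ? 0 : -1;
	}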
760 744
761static void 745static void
762idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) 746idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)
763{ 747{
764 struct idmap_msg *im = msg->data; 748 /* Free memory allocated in nfs_idmap_legacy_upcall() */
765 struct idmap *idmap = container_of(im, struct idmap, idmap_im); 749 kfree(msg->data);
766 750 kfree(msg);
767 if (msg->errno >= 0)
768 return;
769 mutex_lock(&idmap->idmap_im_lock);
770 im->im_status = IDMAP_STATUS_LOOKUPFAIL;
771 wake_up(&idmap->idmap_wq);
772 mutex_unlock(&idmap->idmap_im_lock);
773}
774
775/*
776 * Fowler/Noll/Vo hash
777 * http://www.isthe.com/chongo/tech/comp/fnv/
778 */
779
780#define FNV_P_32 ((unsigned int)0x01000193) /* 16777619 */
781#define FNV_1_32 ((unsigned int)0x811c9dc5) /* 2166136261 */
782
783static unsigned int fnvhash32(const void *buf, size_t buflen)
784{
785 const unsigned char *p, *end = (const unsigned char *)buf + buflen;
786 unsigned int hash = FNV_1_32;
787
788 for (p = buf; p < end; p++) {
789 hash *= FNV_P_32;
790 hash ^= (unsigned int)*p;
791 }
792
793 return hash;
794} 751}
795 752
796int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) 753int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
@@ -799,16 +756,16 @@ int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_
799 756
800 if (nfs_map_string_to_numeric(name, namelen, uid)) 757 if (nfs_map_string_to_numeric(name, namelen, uid))
801 return 0; 758 return 0;
802 return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); 759 return nfs_idmap_lookup_id(name, namelen, "uid", uid, idmap);
803} 760}
804 761
805int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) 762int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
806{ 763{
807 struct idmap *idmap = server->nfs_client->cl_idmap; 764 struct idmap *idmap = server->nfs_client->cl_idmap;
808 765
809 if (nfs_map_string_to_numeric(name, namelen, uid)) 766 if (nfs_map_string_to_numeric(name, namelen, gid))
810 return 0; 767 return 0;
811 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); 768 return nfs_idmap_lookup_id(name, namelen, "gid", gid, idmap);
812} 769}
813 770
814int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) 771int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
@@ -817,21 +774,19 @@ int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, s
817 int ret = -EINVAL; 774 int ret = -EINVAL;
818 775
819 if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) 776 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
820 ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); 777 ret = nfs_idmap_lookup_name(uid, "user", buf, buflen, idmap);
821 if (ret < 0) 778 if (ret < 0)
822 ret = nfs_map_numeric_to_string(uid, buf, buflen); 779 ret = nfs_map_numeric_to_string(uid, buf, buflen);
823 return ret; 780 return ret;
824} 781}
825int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) 782int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
826{ 783{
827 struct idmap *idmap = server->nfs_client->cl_idmap; 784 struct idmap *idmap = server->nfs_client->cl_idmap;
828 int ret = -EINVAL; 785 int ret = -EINVAL;
829 786
830 if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) 787 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
831 ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); 788 ret = nfs_idmap_lookup_name(gid, "group", buf, buflen, idmap);
832 if (ret < 0) 789 if (ret < 0)
833 ret = nfs_map_numeric_to_string(uid, buf, buflen); 790 ret = nfs_map_numeric_to_string(gid, buf, buflen);
834 return ret; 791 return ret;
835} 792}
836
837#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
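
The four entry points above now funnel through nfs_idmap_lookup_id() and nfs_idmap_lookup_name(), whose bodies sit in an earlier hunk of idmap.c that is not shown in this section. A plausible sketch of the name-to-id path, assuming the request_key()-based resolver this patch introduces (key_type_id_resolver), purely to show the flow:

	/* Plausible sketch only -- not the committed helper. */
	static int sketch_idmap_lookup_id(const char *name, size_t namelen,
					  const char *type, __u32 *id)
	{
		char desc[IDMAP_NAMESZ + 8];
		struct key *rkey;
		int ret;

		if (snprintf(desc, sizeof(desc), "%s:%.*s",
			     type, (int)namelen, name) >= (int)sizeof(desc))
			return -EINVAL;
		/* may trigger nfs_idmap_legacy_upcall() via the pipe above */
		rkey = request_key(&key_type_id_resolver, desc, "");
		if (IS_ERR(rkey))
			return PTR_ERR(rkey);
		/* payload is the string instantiated above; locking elided */
		ret = kstrtouint(rkey->payload.data, 10, id);
		key_put(rkey);
		return ret;
	}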
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c2ce8196912c..e8bbfa5b3500 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -39,6 +39,7 @@
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/compat.h> 40#include <linux/compat.h>
41#include <linux/freezer.h> 41#include <linux/freezer.h>
42#include <linux/crc32.h>
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44 45
@@ -50,6 +51,7 @@
50#include "fscache.h" 51#include "fscache.h"
51#include "dns_resolve.h" 52#include "dns_resolve.h"
52#include "pnfs.h" 53#include "pnfs.h"
54#include "netns.h"
53 55
54#define NFSDBG_FACILITY NFSDBG_VFS 56#define NFSDBG_FACILITY NFSDBG_VFS
55 57
@@ -387,9 +389,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
387 unlock_new_inode(inode); 389 unlock_new_inode(inode);
388 } else 390 } else
389 nfs_refresh_inode(inode, fattr); 391 nfs_refresh_inode(inode, fattr);
390 dprintk("NFS: nfs_fhget(%s/%Ld ct=%d)\n", 392 dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n",
391 inode->i_sb->s_id, 393 inode->i_sb->s_id,
392 (long long)NFS_FILEID(inode), 394 (long long)NFS_FILEID(inode),
395 nfs_display_fhandle_hash(fh),
393 atomic_read(&inode->i_count)); 396 atomic_read(&inode->i_count));
394 397
395out: 398out:
@@ -400,7 +403,7 @@ out_no_inode:
400 goto out; 403 goto out;
401} 404}
402 405
403#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE) 406#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN)
404 407
405int 408int
406nfs_setattr(struct dentry *dentry, struct iattr *attr) 409nfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -422,7 +425,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
422 425
423 /* Optimization: if the end result is no change, don't RPC */ 426 /* Optimization: if the end result is no change, don't RPC */
424 attr->ia_valid &= NFS_VALID_ATTRS; 427 attr->ia_valid &= NFS_VALID_ATTRS;
425 if ((attr->ia_valid & ~ATTR_FILE) == 0) 428 if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)
426 return 0; 429 return 0;
427 430
428 /* Write all dirty data */ 431 /* Write all dirty data */
@@ -1043,6 +1046,67 @@ struct nfs_fh *nfs_alloc_fhandle(void)
1043 return fh; 1046 return fh;
1044} 1047}
1045 1048
1049#ifdef NFS_DEBUG
1050/*
1051 * _nfs_display_fhandle_hash - calculate the crc32 hash for the filehandle
1052 * in the same way that wireshark does
1053 *
1054 * @fh: file handle
1055 *
1056 * For debugging only.
1057 */
1058u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh)
1059{
1060 /* wireshark uses 32-bit AUTODIN crc and does a bitwise
1061 * not on the result */
1062 return ~crc32(0xFFFFFFFF, &fh->data[0], fh->size);
1063}
1064
1065/*
1066 * _nfs_display_fhandle - display an NFS file handle on the console
1067 *
1068 * @fh: file handle to display
1069 * @caption: display caption
1070 *
1071 * For debugging only.
1072 */
1073void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption)
1074{
1075 unsigned short i;
1076
1077 if (fh == NULL || fh->size == 0) {
1078 printk(KERN_DEFAULT "%s at %p is empty\n", caption, fh);
1079 return;
1080 }
1081
1082 printk(KERN_DEFAULT "%s at %p is %u bytes, crc: 0x%08x:\n",
1083 caption, fh, fh->size, _nfs_display_fhandle_hash(fh));
1084 for (i = 0; i < fh->size; i += 16) {
1085 __be32 *pos = (__be32 *)&fh->data[i];
1086
1087 switch ((fh->size - i - 1) >> 2) {
1088 case 0:
1089 printk(KERN_DEFAULT " %08x\n",
1090 be32_to_cpup(pos));
1091 break;
1092 case 1:
1093 printk(KERN_DEFAULT " %08x %08x\n",
1094 be32_to_cpup(pos), be32_to_cpup(pos + 1));
1095 break;
1096 case 2:
1097 printk(KERN_DEFAULT " %08x %08x %08x\n",
1098 be32_to_cpup(pos), be32_to_cpup(pos + 1),
1099 be32_to_cpup(pos + 2));
1100 break;
1101 default:
1102 printk(KERN_DEFAULT " %08x %08x %08x %08x\n",
1103 be32_to_cpup(pos), be32_to_cpup(pos + 1),
1104 be32_to_cpup(pos + 2), be32_to_cpup(pos + 3));
1105 }
1106 }
1107}
1108#endif
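
The same value can be reproduced in userspace: zlib's crc32() applies the standard CRC-32 pre/post inversion internally, so crc32(0, buf, len) should equal the kernel's ~crc32(0xFFFFFFFF, buf, len) above. That equivalence is stated here as an assumption worth spot-checking against wireshark:

	#include <stdint.h>
	#include <zlib.h>

	/* Userspace twin of _nfs_display_fhandle_hash() (assumed equivalent). */
	static uint32_t fh_hash(const unsigned char *fh_data, unsigned int fh_size)
	{
		return (uint32_t)crc32(0L, fh_data, fh_size);
	}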
1109
1046/** 1110/**
1047 * nfs_inode_attrs_need_update - check if the inode attributes need updating 1111 * nfs_inode_attrs_need_update - check if the inode attributes need updating
1048 * @inode - pointer to inode 1112 * @inode - pointer to inode
@@ -1210,8 +1274,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1210 unsigned long now = jiffies; 1274 unsigned long now = jiffies;
1211 unsigned long save_cache_validity; 1275 unsigned long save_cache_validity;
1212 1276
1213 dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", 1277 dfprintk(VFS, "NFS: %s(%s/%ld fh_crc=0x%08x ct=%d info=0x%x)\n",
1214 __func__, inode->i_sb->s_id, inode->i_ino, 1278 __func__, inode->i_sb->s_id, inode->i_ino,
1279 nfs_display_fhandle_hash(NFS_FH(inode)),
1215 atomic_read(&inode->i_count), fattr->valid); 1280 atomic_read(&inode->i_count), fattr->valid);
1216 1281
1217 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) 1282 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
@@ -1405,7 +1470,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1405 /* 1470 /*
1406 * Big trouble! The inode has become a different object. 1471 * Big trouble! The inode has become a different object.
1407 */ 1472 */
1408 printk(KERN_DEBUG "%s: inode %ld mode changed, %07o to %07o\n", 1473 printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n",
1409 __func__, inode->i_ino, inode->i_mode, fattr->mode); 1474 __func__, inode->i_ino, inode->i_mode, fattr->mode);
1410 out_err: 1475 out_err:
1411 /* 1476 /*
@@ -1494,7 +1559,7 @@ static void init_once(void *foo)
1494 INIT_LIST_HEAD(&nfsi->open_files); 1559 INIT_LIST_HEAD(&nfsi->open_files);
1495 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); 1560 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
1496 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); 1561 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
1497 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); 1562 INIT_LIST_HEAD(&nfsi->commit_list);
1498 nfsi->npages = 0; 1563 nfsi->npages = 0;
1499 nfsi->ncommit = 0; 1564 nfsi->ncommit = 0;
1500 atomic_set(&nfsi->silly_count, 1); 1565 atomic_set(&nfsi->silly_count, 1);
@@ -1551,6 +1616,28 @@ static void nfsiod_stop(void)
1551 destroy_workqueue(wq); 1616 destroy_workqueue(wq);
1552} 1617}
1553 1618
1619int nfs_net_id;
1620EXPORT_SYMBOL_GPL(nfs_net_id);
1621
1622static int nfs_net_init(struct net *net)
1623{
1624 nfs_clients_init(net);
1625 return nfs_dns_resolver_cache_init(net);
1626}
1627
1628static void nfs_net_exit(struct net *net)
1629{
1630 nfs_dns_resolver_cache_destroy(net);
1631 nfs_cleanup_cb_ident_idr(net);
1632}
1633
1634static struct pernet_operations nfs_net_ops = {
1635 .init = nfs_net_init,
1636 .exit = nfs_net_exit,
1637 .id = &nfs_net_id,
1638 .size = sizeof(struct nfs_net),
1639};
1640
1554/* 1641/*
1555 * Initialize NFS 1642 * Initialize NFS
1556 */ 1643 */
@@ -1560,10 +1647,14 @@ static int __init init_nfs_fs(void)
1560 1647
1561 err = nfs_idmap_init(); 1648 err = nfs_idmap_init();
1562 if (err < 0) 1649 if (err < 0)
1563 goto out9; 1650 goto out10;
1564 1651
1565 err = nfs_dns_resolver_init(); 1652 err = nfs_dns_resolver_init();
1566 if (err < 0) 1653 if (err < 0)
1654 goto out9;
1655
1656 err = register_pernet_subsys(&nfs_net_ops);
1657 if (err < 0)
1567 goto out8; 1658 goto out8;
1568 1659
1569 err = nfs_fscache_register(); 1660 err = nfs_fscache_register();
@@ -1599,14 +1690,14 @@ static int __init init_nfs_fs(void)
1599 goto out0; 1690 goto out0;
1600 1691
1601#ifdef CONFIG_PROC_FS 1692#ifdef CONFIG_PROC_FS
1602 rpc_proc_register(&nfs_rpcstat); 1693 rpc_proc_register(&init_net, &nfs_rpcstat);
1603#endif 1694#endif
1604 if ((err = register_nfs_fs()) != 0) 1695 if ((err = register_nfs_fs()) != 0)
1605 goto out; 1696 goto out;
1606 return 0; 1697 return 0;
1607out: 1698out:
1608#ifdef CONFIG_PROC_FS 1699#ifdef CONFIG_PROC_FS
1609 rpc_proc_unregister("nfs"); 1700 rpc_proc_unregister(&init_net, "nfs");
1610#endif 1701#endif
1611 nfs_destroy_directcache(); 1702 nfs_destroy_directcache();
1612out0: 1703out0:
@@ -1624,10 +1715,12 @@ out5:
1624out6: 1715out6:
1625 nfs_fscache_unregister(); 1716 nfs_fscache_unregister();
1626out7: 1717out7:
1627 nfs_dns_resolver_destroy(); 1718 unregister_pernet_subsys(&nfs_net_ops);
1628out8: 1719out8:
1629 nfs_idmap_quit(); 1720 nfs_dns_resolver_destroy();
1630out9: 1721out9:
1722 nfs_idmap_quit();
1723out10:
1631 return err; 1724 return err;
1632} 1725}
1633 1726
@@ -1639,12 +1732,12 @@ static void __exit exit_nfs_fs(void)
1639 nfs_destroy_inodecache(); 1732 nfs_destroy_inodecache();
1640 nfs_destroy_nfspagecache(); 1733 nfs_destroy_nfspagecache();
1641 nfs_fscache_unregister(); 1734 nfs_fscache_unregister();
1735 unregister_pernet_subsys(&nfs_net_ops);
1642 nfs_dns_resolver_destroy(); 1736 nfs_dns_resolver_destroy();
1643 nfs_idmap_quit(); 1737 nfs_idmap_quit();
1644#ifdef CONFIG_PROC_FS 1738#ifdef CONFIG_PROC_FS
1645 rpc_proc_unregister("nfs"); 1739 rpc_proc_unregister(&init_net, "nfs");
1646#endif 1740#endif
1647 nfs_cleanup_cb_ident_idr();
1648 unregister_nfs_fs(); 1741 unregister_nfs_fs();
1649 nfs_fs_proc_exit(); 1742 nfs_fs_proc_exit();
1650 nfsiod_stop(); 1743 nfsiod_stop();
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 8102db9b926c..2476dc69365f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -123,6 +123,7 @@ struct nfs_parsed_mount_data {
123 } nfs_server; 123 } nfs_server;
124 124
125 struct security_mnt_opts lsm_opts; 125 struct security_mnt_opts lsm_opts;
126 struct net *net;
126}; 127};
127 128
128/* mount_clnt.c */ 129/* mount_clnt.c */
@@ -137,20 +138,22 @@ struct nfs_mount_request {
137 int noresvport; 138 int noresvport;
138 unsigned int *auth_flav_len; 139 unsigned int *auth_flav_len;
139 rpc_authflavor_t *auth_flavs; 140 rpc_authflavor_t *auth_flavs;
141 struct net *net;
140}; 142};
141 143
142extern int nfs_mount(struct nfs_mount_request *info); 144extern int nfs_mount(struct nfs_mount_request *info);
143extern void nfs_umount(const struct nfs_mount_request *info); 145extern void nfs_umount(const struct nfs_mount_request *info);
144 146
145/* client.c */ 147/* client.c */
146extern struct rpc_program nfs_program; 148extern const struct rpc_program nfs_program;
149extern void nfs_clients_init(struct net *net);
147 150
148extern void nfs_cleanup_cb_ident_idr(void); 151extern void nfs_cleanup_cb_ident_idr(struct net *);
149extern void nfs_put_client(struct nfs_client *); 152extern void nfs_put_client(struct nfs_client *);
150extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *); 153extern struct nfs_client *nfs4_find_client_ident(struct net *, int);
151extern struct nfs_client *nfs4_find_client_ident(int);
152extern struct nfs_client * 154extern struct nfs_client *
153nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *); 155nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
156 struct nfs4_sessionid *);
154extern struct nfs_server *nfs_create_server( 157extern struct nfs_server *nfs_create_server(
155 const struct nfs_parsed_mount_data *, 158 const struct nfs_parsed_mount_data *,
156 struct nfs_fh *); 159 struct nfs_fh *);
@@ -329,6 +332,8 @@ void nfs_retry_commit(struct list_head *page_list,
329void nfs_commit_clear_lock(struct nfs_inode *nfsi); 332void nfs_commit_clear_lock(struct nfs_inode *nfsi);
330void nfs_commitdata_release(void *data); 333void nfs_commitdata_release(void *data);
331void nfs_commit_release_pages(struct nfs_write_data *data); 334void nfs_commit_release_pages(struct nfs_write_data *data);
335void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *head);
336void nfs_request_remove_commit_list(struct nfs_page *req);
332 337
333#ifdef CONFIG_MIGRATION 338#ifdef CONFIG_MIGRATION
334extern int nfs_migrate_page(struct address_space *, 339extern int nfs_migrate_page(struct address_space *,
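
nfs_request_add_commit_list() and nfs_request_remove_commit_list(), declared above, pair with the per-inode commit_list that replaces the radix tree in the inode.c hunk earlier in this section. Their bodies live in fs/nfs/write.c, outside this diff; the add side plausibly amounts to something like this (a sketch, not the committed code):

	void sketch_request_add_commit_list(struct nfs_page *req,
					    struct list_head *head)
	{
		struct inode *inode = req->wb_context->dentry->d_inode;

		set_bit(PG_CLEAN, &req->wb_flags);
		spin_lock(&inode->i_lock);	/* list is i_lock-protected */
		nfs_list_add_request(req, head);
		NFS_I(inode)->ncommit++;
		spin_unlock(&inode->i_lock);
	}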
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index d4c2d6b7507e..8e65c7f1f87c 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -16,7 +16,7 @@
16#include <linux/nfs_fs.h> 16#include <linux/nfs_fs.h>
17#include "internal.h" 17#include "internal.h"
18 18
19#ifdef RPC_DEBUG 19#ifdef NFS_DEBUG
20# define NFSDBG_FACILITY NFSDBG_MOUNT 20# define NFSDBG_FACILITY NFSDBG_MOUNT
21#endif 21#endif
22 22
@@ -67,7 +67,7 @@ enum {
67 MOUNTPROC3_EXPORT = 5, 67 MOUNTPROC3_EXPORT = 5,
68}; 68};
69 69
70static struct rpc_program mnt_program; 70static const struct rpc_program mnt_program;
71 71
72/* 72/*
73 * Defined by OpenGroup XNFS Version 3W, chapter 8 73 * Defined by OpenGroup XNFS Version 3W, chapter 8
@@ -153,7 +153,7 @@ int nfs_mount(struct nfs_mount_request *info)
153 .rpc_resp = &result, 153 .rpc_resp = &result,
154 }; 154 };
155 struct rpc_create_args args = { 155 struct rpc_create_args args = {
156 .net = &init_net, 156 .net = info->net,
157 .protocol = info->protocol, 157 .protocol = info->protocol,
158 .address = info->sap, 158 .address = info->sap,
159 .addrsize = info->salen, 159 .addrsize = info->salen,
@@ -225,7 +225,7 @@ void nfs_umount(const struct nfs_mount_request *info)
225 .to_retries = 2, 225 .to_retries = 2,
226 }; 226 };
227 struct rpc_create_args args = { 227 struct rpc_create_args args = {
228 .net = &init_net, 228 .net = info->net,
229 .protocol = IPPROTO_UDP, 229 .protocol = IPPROTO_UDP,
230 .address = info->sap, 230 .address = info->sap,
231 .addrsize = info->salen, 231 .addrsize = info->salen,
@@ -488,19 +488,19 @@ static struct rpc_procinfo mnt3_procedures[] = {
488}; 488};
489 489
490 490
491static struct rpc_version mnt_version1 = { 491static const struct rpc_version mnt_version1 = {
492 .number = 1, 492 .number = 1,
493 .nrprocs = ARRAY_SIZE(mnt_procedures), 493 .nrprocs = ARRAY_SIZE(mnt_procedures),
494 .procs = mnt_procedures, 494 .procs = mnt_procedures,
495}; 495};
496 496
497static struct rpc_version mnt_version3 = { 497static const struct rpc_version mnt_version3 = {
498 .number = 3, 498 .number = 3,
499 .nrprocs = ARRAY_SIZE(mnt3_procedures), 499 .nrprocs = ARRAY_SIZE(mnt3_procedures),
500 .procs = mnt3_procedures, 500 .procs = mnt3_procedures,
501}; 501};
502 502
503static struct rpc_version *mnt_version[] = { 503static const struct rpc_version *mnt_version[] = {
504 NULL, 504 NULL,
505 &mnt_version1, 505 &mnt_version1,
506 NULL, 506 NULL,
@@ -509,7 +509,7 @@ static struct rpc_version *mnt_version[] = {
509 509
510static struct rpc_stat mnt_stats; 510static struct rpc_stat mnt_stats;
511 511
512static struct rpc_program mnt_program = { 512static const struct rpc_program mnt_program = {
513 .name = "mount", 513 .name = "mount",
514 .number = NFS_MNT_PROGRAM, 514 .number = NFS_MNT_PROGRAM,
515 .nrvers = ARRAY_SIZE(mnt_version), 515 .nrvers = ARRAY_SIZE(mnt_version),
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 8102391bb374..1807866bb3ab 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -276,7 +276,10 @@ out:
276 nfs_free_fattr(fattr); 276 nfs_free_fattr(fattr);
277 nfs_free_fhandle(fh); 277 nfs_free_fhandle(fh);
278out_nofree: 278out_nofree:
279 dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt); 279 if (IS_ERR(mnt))
280 dprintk("<-- %s(): error %ld\n", __func__, PTR_ERR(mnt));
281 else
282 dprintk("<-- %s() = %p\n", __func__, mnt);
280 return mnt; 283 return mnt;
281} 284}
282 285
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
new file mode 100644
index 000000000000..aa14ec303e94
--- /dev/null
+++ b/fs/nfs/netns.h
@@ -0,0 +1,27 @@
1#ifndef __NFS_NETNS_H__
2#define __NFS_NETNS_H__
3
4#include <net/net_namespace.h>
5#include <net/netns/generic.h>
6
7struct bl_dev_msg {
8 int32_t status;
9 uint32_t major, minor;
10};
11
12struct nfs_net {
13 struct cache_detail *nfs_dns_resolve;
14 struct rpc_pipe *bl_device_pipe;
15 struct bl_dev_msg bl_mount_reply;
16 wait_queue_head_t bl_wq;
17 struct list_head nfs_client_list;
18 struct list_head nfs_volume_list;
19#ifdef CONFIG_NFS_V4
20 struct idr cb_ident_idr; /* Protected by nfs_client_lock */
21#endif
22 spinlock_t nfs_client_lock;
23};
24
25extern int nfs_net_id;
26
27#endif
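
Callers elsewhere in the series (see nfs_get_client_for_event() in the idmap.c hunk above) reach this structure through the generic per-net area. The accessor pattern is:

	#include <net/netns/generic.h>

	/* nfs_net_id is assigned by register_pernet_subsys() through the
	 * .id field of nfs_net_ops in inode.c. */
	static inline struct nfs_net *example_nfs_net(struct net *net)
	{
		return net_generic(net, nfs_net_id);
	}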
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 792cb13a4304..1f56000fabbd 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -1150,7 +1150,7 @@ struct rpc_procinfo nfs_procedures[] = {
1150 PROC(STATFS, fhandle, statfsres, 0), 1150 PROC(STATFS, fhandle, statfsres, 0),
1151}; 1151};
1152 1152
1153struct rpc_version nfs_version2 = { 1153const struct rpc_version nfs_version2 = {
1154 .number = 2, 1154 .number = 2,
1155 .nrprocs = ARRAY_SIZE(nfs_procedures), 1155 .nrprocs = ARRAY_SIZE(nfs_procedures),
1156 .procs = nfs_procedures 1156 .procs = nfs_procedures
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 7ef23979896d..e4498dc351a8 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -192,7 +192,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
192 .pages = pages, 192 .pages = pages,
193 }; 193 };
194 struct nfs3_getaclres res = { 194 struct nfs3_getaclres res = {
195 0 195 NULL,
196 }; 196 };
197 struct rpc_message msg = { 197 struct rpc_message msg = {
198 .rpc_argp = &args, 198 .rpc_argp = &args,
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 91943953a370..5242eae6711a 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -428,6 +428,11 @@ nfs3_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
428 msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE]; 428 msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE];
429} 429}
430 430
431static void nfs3_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
432{
433 rpc_call_start(task);
434}
435
431static int 436static int
432nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir) 437nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
433{ 438{
@@ -445,6 +450,11 @@ nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
445 msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME]; 450 msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME];
446} 451}
447 452
453static void nfs3_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
454{
455 rpc_call_start(task);
456}
457
448static int 458static int
449nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir, 459nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
450 struct inode *new_dir) 460 struct inode *new_dir)
@@ -814,6 +824,11 @@ static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message
814 msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; 824 msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
815} 825}
816 826
827static void nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
828{
829 rpc_call_start(task);
830}
831
817static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) 832static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
818{ 833{
819 if (nfs3_async_handle_jukebox(task, data->inode)) 834 if (nfs3_async_handle_jukebox(task, data->inode))
@@ -828,6 +843,11 @@ static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
828 msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; 843 msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
829} 844}
830 845
846static void nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
847{
848 rpc_call_start(task);
849}
850
831static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data) 851static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
832{ 852{
833 if (nfs3_async_handle_jukebox(task, data->inode)) 853 if (nfs3_async_handle_jukebox(task, data->inode))
@@ -864,9 +884,11 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
864 .create = nfs3_proc_create, 884 .create = nfs3_proc_create,
865 .remove = nfs3_proc_remove, 885 .remove = nfs3_proc_remove,
866 .unlink_setup = nfs3_proc_unlink_setup, 886 .unlink_setup = nfs3_proc_unlink_setup,
887 .unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare,
867 .unlink_done = nfs3_proc_unlink_done, 888 .unlink_done = nfs3_proc_unlink_done,
868 .rename = nfs3_proc_rename, 889 .rename = nfs3_proc_rename,
869 .rename_setup = nfs3_proc_rename_setup, 890 .rename_setup = nfs3_proc_rename_setup,
891 .rename_rpc_prepare = nfs3_proc_rename_rpc_prepare,
870 .rename_done = nfs3_proc_rename_done, 892 .rename_done = nfs3_proc_rename_done,
871 .link = nfs3_proc_link, 893 .link = nfs3_proc_link,
872 .symlink = nfs3_proc_symlink, 894 .symlink = nfs3_proc_symlink,
@@ -879,8 +901,10 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
879 .pathconf = nfs3_proc_pathconf, 901 .pathconf = nfs3_proc_pathconf,
880 .decode_dirent = nfs3_decode_dirent, 902 .decode_dirent = nfs3_decode_dirent,
881 .read_setup = nfs3_proc_read_setup, 903 .read_setup = nfs3_proc_read_setup,
904 .read_rpc_prepare = nfs3_proc_read_rpc_prepare,
882 .read_done = nfs3_read_done, 905 .read_done = nfs3_read_done,
883 .write_setup = nfs3_proc_write_setup, 906 .write_setup = nfs3_proc_write_setup,
907 .write_rpc_prepare = nfs3_proc_write_rpc_prepare,
884 .write_done = nfs3_write_done, 908 .write_done = nfs3_write_done,
885 .commit_setup = nfs3_proc_commit_setup, 909 .commit_setup = nfs3_proc_commit_setup,
886 .commit_done = nfs3_commit_done, 910 .commit_done = nfs3_commit_done,
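
The new *_rpc_prepare hooks exist so that NFSv4.1 can slot session/sequence setup in before an async call starts; for v2/v3 they collapse to a bare rpc_call_start(), as above. The generic code presumably dispatches through the ops table along these lines (the actual call sites are in read.c/write.c/unlink.c, outside this section):

	/* Sketch of the dispatch path, not the committed call site. */
	static void sketch_read_prepare(struct rpc_task *task, void *calldata)
	{
		struct nfs_read_data *data = calldata;

		NFS_PROTO(data->inode)->read_rpc_prepare(task, data);
	}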
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 183c6b123d0f..a77cc9a3ce55 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -2461,7 +2461,7 @@ struct rpc_procinfo nfs3_procedures[] = {
2461 PROC(COMMIT, commit, commit, 5), 2461 PROC(COMMIT, commit, commit, 5),
2462}; 2462};
2463 2463
2464struct rpc_version nfs_version3 = { 2464const struct rpc_version nfs_version3 = {
2465 .number = 3, 2465 .number = 3,
2466 .nrprocs = ARRAY_SIZE(nfs3_procedures), 2466 .nrprocs = ARRAY_SIZE(nfs3_procedures),
2467 .procs = nfs3_procedures 2467 .procs = nfs3_procedures
@@ -2489,7 +2489,7 @@ static struct rpc_procinfo nfs3_acl_procedures[] = {
2489 }, 2489 },
2490}; 2490};
2491 2491
2492struct rpc_version nfsacl_version3 = { 2492const struct rpc_version nfsacl_version3 = {
2493 .number = 3, 2493 .number = 3,
2494 .nrprocs = sizeof(nfs3_acl_procedures)/ 2494 .nrprocs = sizeof(nfs3_acl_procedures)/
2495 sizeof(nfs3_acl_procedures[0]), 2495 sizeof(nfs3_acl_procedures[0]),
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 4d7d0aedc101..97ecc863dd76 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -20,7 +20,6 @@ enum nfs4_client_state {
20 NFS4CLNT_RECLAIM_REBOOT, 20 NFS4CLNT_RECLAIM_REBOOT,
21 NFS4CLNT_RECLAIM_NOGRACE, 21 NFS4CLNT_RECLAIM_NOGRACE,
22 NFS4CLNT_DELEGRETURN, 22 NFS4CLNT_DELEGRETURN,
23 NFS4CLNT_LAYOUTRECALL,
24 NFS4CLNT_SESSION_RESET, 23 NFS4CLNT_SESSION_RESET,
25 NFS4CLNT_RECALL_SLOT, 24 NFS4CLNT_RECALL_SLOT,
26 NFS4CLNT_LEASE_CONFIRM, 25 NFS4CLNT_LEASE_CONFIRM,
@@ -44,7 +43,7 @@ struct nfs4_minor_version_ops {
44 struct nfs4_sequence_args *args, 43 struct nfs4_sequence_args *args,
45 struct nfs4_sequence_res *res, 44 struct nfs4_sequence_res *res,
46 int cache_reply); 45 int cache_reply);
47 int (*validate_stateid)(struct nfs_delegation *, 46 bool (*match_stateid)(const nfs4_stateid *,
48 const nfs4_stateid *); 47 const nfs4_stateid *);
49 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, 48 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
50 struct nfs_fsinfo *); 49 struct nfs_fsinfo *);
@@ -53,26 +52,25 @@ struct nfs4_minor_version_ops {
53 const struct nfs4_state_maintenance_ops *state_renewal_ops; 52 const struct nfs4_state_maintenance_ops *state_renewal_ops;
54}; 53};
55 54
56/* 55struct nfs_unique_id {
57 * struct rpc_sequence ensures that RPC calls are sent in the exact 56 struct rb_node rb_node;
58 * order that they appear on the list. 57 __u64 id;
59 */
60struct rpc_sequence {
61 struct rpc_wait_queue wait; /* RPC call delay queue */
62 spinlock_t lock; /* Protects the list */
63 struct list_head list; /* Defines sequence of RPC calls */
64}; 58};
65 59
66#define NFS_SEQID_CONFIRMED 1 60#define NFS_SEQID_CONFIRMED 1
67struct nfs_seqid_counter { 61struct nfs_seqid_counter {
68 struct rpc_sequence *sequence; 62 int owner_id;
69 int flags; 63 int flags;
70 u32 counter; 64 u32 counter;
65 spinlock_t lock; /* Protects the list */
66 struct list_head list; /* Defines sequence of RPC calls */
67 struct rpc_wait_queue wait; /* RPC call delay queue */
71}; 68};
72 69
73struct nfs_seqid { 70struct nfs_seqid {
74 struct nfs_seqid_counter *sequence; 71 struct nfs_seqid_counter *sequence;
75 struct list_head list; 72 struct list_head list;
73 struct rpc_task *task;
76}; 74};
77 75
78static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status) 76static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status)
@@ -81,18 +79,12 @@ static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status
81 seqid->flags |= NFS_SEQID_CONFIRMED; 79 seqid->flags |= NFS_SEQID_CONFIRMED;
82} 80}
83 81
84struct nfs_unique_id {
85 struct rb_node rb_node;
86 __u64 id;
87};
88
89/* 82/*
90 * NFS4 state_owners and lock_owners are simply labels for ordered 83 * NFS4 state_owners and lock_owners are simply labels for ordered
91 * sequences of RPC calls. Their sole purpose is to provide once-only 84 * sequences of RPC calls. Their sole purpose is to provide once-only
92 * semantics by allowing the server to identify replayed requests. 85 * semantics by allowing the server to identify replayed requests.
93 */ 86 */
94struct nfs4_state_owner { 87struct nfs4_state_owner {
95 struct nfs_unique_id so_owner_id;
96 struct nfs_server *so_server; 88 struct nfs_server *so_server;
97 struct list_head so_lru; 89 struct list_head so_lru;
98 unsigned long so_expires; 90 unsigned long so_expires;
@@ -105,7 +97,6 @@ struct nfs4_state_owner {
105 unsigned long so_flags; 97 unsigned long so_flags;
106 struct list_head so_states; 98 struct list_head so_states;
107 struct nfs_seqid_counter so_seqid; 99 struct nfs_seqid_counter so_seqid;
108 struct rpc_sequence so_sequence;
109}; 100};
110 101
111enum { 102enum {
@@ -146,8 +137,6 @@ struct nfs4_lock_state {
146#define NFS_LOCK_INITIALIZED 1 137#define NFS_LOCK_INITIALIZED 1
147 int ls_flags; 138 int ls_flags;
148 struct nfs_seqid_counter ls_seqid; 139 struct nfs_seqid_counter ls_seqid;
149 struct rpc_sequence ls_sequence;
150 struct nfs_unique_id ls_id;
151 nfs4_stateid ls_stateid; 140 nfs4_stateid ls_stateid;
152 atomic_t ls_count; 141 atomic_t ls_count;
153 struct nfs4_lock_owner ls_owner; 142 struct nfs4_lock_owner ls_owner;
@@ -193,6 +182,7 @@ struct nfs4_exception {
193 long timeout; 182 long timeout;
194 int retry; 183 int retry;
195 struct nfs4_state *state; 184 struct nfs4_state *state;
185 struct inode *inode;
196}; 186};
197 187
198struct nfs4_state_recovery_ops { 188struct nfs4_state_recovery_ops {
@@ -224,7 +214,7 @@ extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, boo
224extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 214extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
225extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 215extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
226 struct nfs4_fs_locations *fs_locations, struct page *page); 216 struct nfs4_fs_locations *fs_locations, struct page *page);
227extern void nfs4_release_lockowner(const struct nfs4_lock_state *); 217extern int nfs4_release_lockowner(struct nfs4_lock_state *);
228extern const struct xattr_handler *nfs4_xattr_handlers[]; 218extern const struct xattr_handler *nfs4_xattr_handlers[];
229 219
230#if defined(CONFIG_NFS_V4_1) 220#if defined(CONFIG_NFS_V4_1)
@@ -233,12 +223,13 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
233 return server->nfs_client->cl_session; 223 return server->nfs_client->cl_session;
234} 224}
235 225
226extern bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy);
236extern int nfs4_setup_sequence(const struct nfs_server *server, 227extern int nfs4_setup_sequence(const struct nfs_server *server,
237 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 228 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
238 int cache_reply, struct rpc_task *task); 229 struct rpc_task *task);
239extern int nfs41_setup_sequence(struct nfs4_session *session, 230extern int nfs41_setup_sequence(struct nfs4_session *session,
240 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 231 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
241 int cache_reply, struct rpc_task *task); 232 struct rpc_task *task);
242extern void nfs4_destroy_session(struct nfs4_session *session); 233extern void nfs4_destroy_session(struct nfs4_session *session);
243extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); 234extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
244extern int nfs4_proc_create_session(struct nfs_client *); 235extern int nfs4_proc_create_session(struct nfs_client *);
@@ -269,7 +260,7 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
269 260
270static inline int nfs4_setup_sequence(const struct nfs_server *server, 261static inline int nfs4_setup_sequence(const struct nfs_server *server,
271 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 262 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
272 int cache_reply, struct rpc_task *task) 263 struct rpc_task *task)
273{ 264{
274 return 0; 265 return 0;
275} 266}
@@ -319,7 +310,7 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
319} 310}
320#endif /* CONFIG_NFS_V4_1 */ 311#endif /* CONFIG_NFS_V4_1 */
321 312
322extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); 313extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *, gfp_t);
323extern void nfs4_put_state_owner(struct nfs4_state_owner *); 314extern void nfs4_put_state_owner(struct nfs4_state_owner *);
324extern void nfs4_purge_state_owners(struct nfs_server *); 315extern void nfs4_purge_state_owners(struct nfs_server *);
325extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); 316extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
@@ -327,6 +318,8 @@ extern void nfs4_put_open_state(struct nfs4_state *);
327extern void nfs4_close_state(struct nfs4_state *, fmode_t); 318extern void nfs4_close_state(struct nfs4_state *, fmode_t);
328extern void nfs4_close_sync(struct nfs4_state *, fmode_t); 319extern void nfs4_close_sync(struct nfs4_state *, fmode_t);
329extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); 320extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
321extern void nfs_inode_find_state_and_recover(struct inode *inode,
322 const nfs4_stateid *stateid);
330extern void nfs4_schedule_lease_recovery(struct nfs_client *); 323extern void nfs4_schedule_lease_recovery(struct nfs_client *);
331extern void nfs4_schedule_state_manager(struct nfs_client *); 324extern void nfs4_schedule_state_manager(struct nfs_client *);
332extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); 325extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
@@ -337,7 +330,8 @@ extern void nfs41_handle_server_scope(struct nfs_client *,
337 struct server_scope **); 330 struct server_scope **);
338extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 331extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
339extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 332extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
340extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); 333extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *,
334 fmode_t, fl_owner_t, pid_t);
341 335
342extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); 336extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
343extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); 337extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
@@ -346,6 +340,8 @@ extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
346extern void nfs_release_seqid(struct nfs_seqid *seqid); 340extern void nfs_release_seqid(struct nfs_seqid *seqid);
347extern void nfs_free_seqid(struct nfs_seqid *seqid); 341extern void nfs_free_seqid(struct nfs_seqid *seqid);
348 342
343extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp);
344
349extern const nfs4_stateid zero_stateid; 345extern const nfs4_stateid zero_stateid;
350 346
351/* nfs4xdr.c */ 347/* nfs4xdr.c */
@@ -357,6 +353,16 @@ struct nfs4_mount_data;
357extern struct svc_version nfs4_callback_version1; 353extern struct svc_version nfs4_callback_version1;
358extern struct svc_version nfs4_callback_version4; 354extern struct svc_version nfs4_callback_version4;
359 355
356static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
357{
358 memcpy(dst, src, sizeof(*dst));
359}
360
361static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src)
362{
363 return memcmp(dst, src, sizeof(*dst)) == 0;
364}
365
360#else 366#else
361 367
362#define nfs4_close_state(a, b) do { } while (0) 368#define nfs4_close_state(a, b) do { } while (0)
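
The nfs4_stateid_copy()/nfs4_stateid_match() inlines added above replace open-coded memcpy()/memcmp() on stateids throughout the v4 code; usage is mechanical:

	/* Illustrative caller of the new helpers. */
	static bool example_update_stateid(nfs4_stateid *cur,
					   const nfs4_stateid *src)
	{
		if (nfs4_stateid_match(cur, src))
			return false;		/* unchanged */
		nfs4_stateid_copy(cur, src);	/* struct-sized memcpy */
		return true;
	}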
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 71ec08617e23..634c0bcb4fd6 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -33,7 +33,10 @@
33#include <linux/nfs_page.h> 33#include <linux/nfs_page.h>
34#include <linux/module.h> 34#include <linux/module.h>
35 35
36#include <linux/sunrpc/metrics.h>
37
36#include "internal.h" 38#include "internal.h"
39#include "delegation.h"
37#include "nfs4filelayout.h" 40#include "nfs4filelayout.h"
38 41
39#define NFSDBG_FACILITY NFSDBG_PNFS_LD 42#define NFSDBG_FACILITY NFSDBG_PNFS_LD
@@ -84,12 +87,27 @@ static int filelayout_async_handle_error(struct rpc_task *task,
84 struct nfs_client *clp, 87 struct nfs_client *clp,
85 int *reset) 88 int *reset)
86{ 89{
90 struct nfs_server *mds_server = NFS_SERVER(state->inode);
91 struct nfs_client *mds_client = mds_server->nfs_client;
92
87 if (task->tk_status >= 0) 93 if (task->tk_status >= 0)
88 return 0; 94 return 0;
89
90 *reset = 0; 95 *reset = 0;
91 96
92 switch (task->tk_status) { 97 switch (task->tk_status) {
98 /* MDS state errors */
99 case -NFS4ERR_DELEG_REVOKED:
100 case -NFS4ERR_ADMIN_REVOKED:
101 case -NFS4ERR_BAD_STATEID:
102 nfs_remove_bad_delegation(state->inode);
103 case -NFS4ERR_OPENMODE:
104 nfs4_schedule_stateid_recovery(mds_server, state);
105 goto wait_on_recovery;
106 case -NFS4ERR_EXPIRED:
107 nfs4_schedule_stateid_recovery(mds_server, state);
108 nfs4_schedule_lease_recovery(mds_client);
109 goto wait_on_recovery;
110 /* DS session errors */
93 case -NFS4ERR_BADSESSION: 111 case -NFS4ERR_BADSESSION:
94 case -NFS4ERR_BADSLOT: 112 case -NFS4ERR_BADSLOT:
95 case -NFS4ERR_BAD_HIGH_SLOT: 113 case -NFS4ERR_BAD_HIGH_SLOT:
@@ -115,8 +133,14 @@ static int filelayout_async_handle_error(struct rpc_task *task,
115 *reset = 1; 133 *reset = 1;
116 break; 134 break;
117 } 135 }
136out:
118 task->tk_status = 0; 137 task->tk_status = 0;
119 return -EAGAIN; 138 return -EAGAIN;
139wait_on_recovery:
140 rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
141 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
142 rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task);
143 goto out;
120} 144}
121 145
122/* NFS_PROTO call done callback routines */ 146/* NFS_PROTO call done callback routines */
@@ -173,7 +197,7 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
173 197
174 if (nfs41_setup_sequence(rdata->ds_clp->cl_session, 198 if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
175 &rdata->args.seq_args, &rdata->res.seq_res, 199 &rdata->args.seq_args, &rdata->res.seq_res,
176 0, task)) 200 task))
177 return; 201 return;
178 202
179 rpc_call_start(task); 203 rpc_call_start(task);
@@ -189,10 +213,18 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data)
189 rdata->mds_ops->rpc_call_done(task, data); 213 rdata->mds_ops->rpc_call_done(task, data);
190} 214}
191 215
216static void filelayout_read_count_stats(struct rpc_task *task, void *data)
217{
218 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
219
220 rpc_count_iostats(task, NFS_SERVER(rdata->inode)->client->cl_metrics);
221}
222
192static void filelayout_read_release(void *data) 223static void filelayout_read_release(void *data)
193{ 224{
194 struct nfs_read_data *rdata = (struct nfs_read_data *)data; 225 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
195 226
227 put_lseg(rdata->lseg);
196 rdata->mds_ops->rpc_release(data); 228 rdata->mds_ops->rpc_release(data);
197} 229}
198 230
@@ -254,7 +286,7 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data)
254 286
255 if (nfs41_setup_sequence(wdata->ds_clp->cl_session, 287 if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
256 &wdata->args.seq_args, &wdata->res.seq_res, 288 &wdata->args.seq_args, &wdata->res.seq_res,
257 0, task)) 289 task))
258 return; 290 return;
259 291
260 rpc_call_start(task); 292 rpc_call_start(task);
@@ -268,10 +300,18 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data)
268 wdata->mds_ops->rpc_call_done(task, data); 300 wdata->mds_ops->rpc_call_done(task, data);
269} 301}
270 302
303static void filelayout_write_count_stats(struct rpc_task *task, void *data)
304{
305 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
306
307 rpc_count_iostats(task, NFS_SERVER(wdata->inode)->client->cl_metrics);
308}
309
271static void filelayout_write_release(void *data) 310static void filelayout_write_release(void *data)
272{ 311{
273 struct nfs_write_data *wdata = (struct nfs_write_data *)data; 312 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
274 313
314 put_lseg(wdata->lseg);
275 wdata->mds_ops->rpc_release(data); 315 wdata->mds_ops->rpc_release(data);
276} 316}
277 317
@@ -282,24 +322,28 @@ static void filelayout_commit_release(void *data)
282 nfs_commit_release_pages(wdata); 322 nfs_commit_release_pages(wdata);
283 if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding)) 323 if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding))
284 nfs_commit_clear_lock(NFS_I(wdata->inode)); 324 nfs_commit_clear_lock(NFS_I(wdata->inode));
325 put_lseg(wdata->lseg);
285 nfs_commitdata_release(wdata); 326 nfs_commitdata_release(wdata);
286} 327}
287 328
288struct rpc_call_ops filelayout_read_call_ops = { 329static const struct rpc_call_ops filelayout_read_call_ops = {
289 .rpc_call_prepare = filelayout_read_prepare, 330 .rpc_call_prepare = filelayout_read_prepare,
290 .rpc_call_done = filelayout_read_call_done, 331 .rpc_call_done = filelayout_read_call_done,
332 .rpc_count_stats = filelayout_read_count_stats,
291 .rpc_release = filelayout_read_release, 333 .rpc_release = filelayout_read_release,
292}; 334};
293 335
294struct rpc_call_ops filelayout_write_call_ops = { 336static const struct rpc_call_ops filelayout_write_call_ops = {
295 .rpc_call_prepare = filelayout_write_prepare, 337 .rpc_call_prepare = filelayout_write_prepare,
296 .rpc_call_done = filelayout_write_call_done, 338 .rpc_call_done = filelayout_write_call_done,
339 .rpc_count_stats = filelayout_write_count_stats,
297 .rpc_release = filelayout_write_release, 340 .rpc_release = filelayout_write_release,
298}; 341};
299 342
300struct rpc_call_ops filelayout_commit_call_ops = { 343static const struct rpc_call_ops filelayout_commit_call_ops = {
301 .rpc_call_prepare = filelayout_write_prepare, 344 .rpc_call_prepare = filelayout_write_prepare,
302 .rpc_call_done = filelayout_write_call_done, 345 .rpc_call_done = filelayout_write_call_done,
346 .rpc_count_stats = filelayout_write_count_stats,
303 .rpc_release = filelayout_commit_release, 347 .rpc_release = filelayout_commit_release,
304}; 348};
305 349
@@ -367,7 +411,8 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
 		idx = nfs4_fl_calc_ds_index(lseg, j);
 		ds = nfs4_fl_prepare_ds(lseg, idx);
 		if (!ds) {
-			printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
+			printk(KERN_ERR "NFS: %s: prepare_ds failed, use MDS\n",
+				__func__);
 			set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
 			set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
 			return PNFS_NOT_ATTEMPTED;
@@ -575,7 +620,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
 			goto out_err_free;
 		fl->fh_array[i]->size = be32_to_cpup(p++);
 		if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
-			printk(KERN_ERR "Too big fh %d received %d\n",
+			printk(KERN_ERR "NFS: Too big fh %d received %d\n",
 				i, fl->fh_array[i]->size);
 			goto out_err_free;
 		}
@@ -640,14 +685,16 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
 		int size = (fl->stripe_type == STRIPE_SPARSE) ?
 			fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
 
-		fl->commit_buckets = kcalloc(size, sizeof(struct list_head), gfp_flags);
+		fl->commit_buckets = kcalloc(size, sizeof(struct nfs4_fl_commit_bucket), gfp_flags);
 		if (!fl->commit_buckets) {
 			filelayout_free_lseg(&fl->generic_hdr);
 			return NULL;
 		}
 		fl->number_of_buckets = size;
-		for (i = 0; i < size; i++)
-			INIT_LIST_HEAD(&fl->commit_buckets[i]);
+		for (i = 0; i < size; i++) {
+			INIT_LIST_HEAD(&fl->commit_buckets[i].written);
+			INIT_LIST_HEAD(&fl->commit_buckets[i].committing);
+		}
 	}
 	return &fl->generic_hdr;
 }
@@ -679,7 +726,7 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 	return (p_stripe == r_stripe);
 }
 
-void
+static void
 filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
 			struct nfs_page *req)
 {
@@ -696,7 +743,7 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
 		nfs_pageio_reset_read_mds(pgio);
 }
 
-void
+static void
 filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 			struct nfs_page *req)
 {
@@ -725,11 +772,6 @@ static const struct nfs_pageio_ops filelayout_pg_write_ops = {
 	.pg_doio = pnfs_generic_pg_writepages,
 };
 
-static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg)
-{
-	return !FILELAYOUT_LSEG(lseg)->commit_through_mds;
-}
-
 static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
 {
 	if (fl->stripe_type == STRIPE_SPARSE)
@@ -738,13 +780,49 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
 		return j;
 }
 
-struct list_head *filelayout_choose_commit_list(struct nfs_page *req)
+/* The generic layer is about to remove the req from the commit list.
+ * If this will make the bucket empty, it will need to put the lseg reference.
+ */
+static void
+filelayout_clear_request_commit(struct nfs_page *req)
+{
+	struct pnfs_layout_segment *freeme = NULL;
+	struct inode *inode = req->wb_context->dentry->d_inode;
+
+	spin_lock(&inode->i_lock);
+	if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
+		goto out;
+	if (list_is_singular(&req->wb_list)) {
+		struct inode *inode = req->wb_context->dentry->d_inode;
+		struct pnfs_layout_segment *lseg;
+
+		/* From here we can find the bucket, but for the moment,
+		 * since there is only one relevant lseg...
+		 */
+		list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
+			if (lseg->pls_range.iomode == IOMODE_RW) {
+				freeme = lseg;
+				break;
+			}
+		}
+	}
+out:
+	nfs_request_remove_commit_list(req);
+	spin_unlock(&inode->i_lock);
+	put_lseg(freeme);
+}
+
+static struct list_head *
+filelayout_choose_commit_list(struct nfs_page *req,
+			      struct pnfs_layout_segment *lseg)
 {
-	struct pnfs_layout_segment *lseg = req->wb_commit_lseg;
 	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
 	u32 i, j;
 	struct list_head *list;
 
+	if (fl->commit_through_mds)
+		return &NFS_I(req->wb_context->dentry->d_inode)->commit_list;
+
 	/* Note that we are calling nfs4_fl_calc_j_index on each page
 	 * that ends up being committed to a data server. An attractive
 	 * alternative is to add a field to nfs_write_data and nfs_page
@@ -754,14 +832,30 @@ struct list_head *filelayout_choose_commit_list(struct nfs_page *req)
 	j = nfs4_fl_calc_j_index(lseg,
 				 (loff_t)req->wb_index << PAGE_CACHE_SHIFT);
 	i = select_bucket_index(fl, j);
-	list = &fl->commit_buckets[i];
+	list = &fl->commit_buckets[i].written;
 	if (list_empty(list)) {
-		/* Non-empty buckets hold a reference on the lseg */
+		/* Non-empty buckets hold a reference on the lseg. That ref
+		 * is normally transferred to the COMMIT call and released
+		 * there. It could also be released if the last req is pulled
+		 * off due to a rewrite, in which case it will be done in
+		 * filelayout_remove_commit_req
+		 */
 		get_lseg(lseg);
 	}
+	set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
 	return list;
 }
 
+static void
+filelayout_mark_request_commit(struct nfs_page *req,
+			       struct pnfs_layout_segment *lseg)
+{
+	struct list_head *list;
+
+	list = filelayout_choose_commit_list(req, lseg);
+	nfs_request_add_commit_list(req, list);
+}
+
 static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
 {
 	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
@@ -797,11 +891,12 @@ static int filelayout_initiate_commit(struct nfs_write_data *data, int how)
 	idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
 	ds = nfs4_fl_prepare_ds(lseg, idx);
 	if (!ds) {
-		printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
+		printk(KERN_ERR "NFS: %s: prepare_ds failed, use MDS\n",
+			__func__);
 		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
 		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
 		prepare_to_resend_writes(data);
-		data->mds_ops->rpc_release(data);
+		filelayout_commit_release(data);
 		return -EAGAIN;
 	}
 	dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how);
@@ -817,24 +912,87 @@ static int filelayout_initiate_commit(struct nfs_write_data *data, int how)
 /*
  * This is only useful while we are using whole file layouts.
  */
-static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode)
+static struct pnfs_layout_segment *
+find_only_write_lseg_locked(struct inode *inode)
 {
-	struct pnfs_layout_segment *lseg, *rv = NULL;
+	struct pnfs_layout_segment *lseg;
 
-	spin_lock(&inode->i_lock);
 	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list)
 		if (lseg->pls_range.iomode == IOMODE_RW)
-			rv = get_lseg(lseg);
+			return lseg;
+	return NULL;
+}
+
+static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode)
+{
+	struct pnfs_layout_segment *rv;
+
+	spin_lock(&inode->i_lock);
+	rv = find_only_write_lseg_locked(inode);
+	if (rv)
+		get_lseg(rv);
 	spin_unlock(&inode->i_lock);
 	return rv;
 }
 
-static int alloc_ds_commits(struct inode *inode, struct list_head *list)
+static int
+filelayout_scan_ds_commit_list(struct nfs4_fl_commit_bucket *bucket, int max,
+			       spinlock_t *lock)
+{
+	struct list_head *src = &bucket->written;
+	struct list_head *dst = &bucket->committing;
+	struct nfs_page *req, *tmp;
+	int ret = 0;
+
+	list_for_each_entry_safe(req, tmp, src, wb_list) {
+		if (!nfs_lock_request(req))
+			continue;
+		if (cond_resched_lock(lock))
+			list_safe_reset_next(req, tmp, wb_list);
+		nfs_request_remove_commit_list(req);
+		clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+		nfs_list_add_request(req, dst);
+		ret++;
+		if (ret == max)
+			break;
+	}
+	return ret;
+}
+
+/* Move reqs from written to committing lists, returning count of number moved.
+ * Note called with i_lock held.
+ */
+static int filelayout_scan_commit_lists(struct inode *inode, int max,
+		spinlock_t *lock)
+{
+	struct pnfs_layout_segment *lseg;
+	struct nfs4_filelayout_segment *fl;
+	int i, rv = 0, cnt;
+
+	lseg = find_only_write_lseg_locked(inode);
+	if (!lseg)
+		goto out_done;
+	fl = FILELAYOUT_LSEG(lseg);
+	if (fl->commit_through_mds)
+		goto out_done;
+	for (i = 0; i < fl->number_of_buckets && max != 0; i++) {
+		cnt = filelayout_scan_ds_commit_list(&fl->commit_buckets[i],
+				max, lock);
+		max -= cnt;
+		rv += cnt;
+	}
+out_done:
+	return rv;
+}
+
+static unsigned int
+alloc_ds_commits(struct inode *inode, struct list_head *list)
 {
 	struct pnfs_layout_segment *lseg;
 	struct nfs4_filelayout_segment *fl;
 	struct nfs_write_data *data;
 	int i, j;
+	unsigned int nreq = 0;
 
 	/* Won't need this when non-whole file layout segments are supported
 	 * instead we will use a pnfs_layout_hdr structure */
@@ -843,28 +1001,27 @@ static int alloc_ds_commits(struct inode *inode, struct list_head *list)
 		return 0;
 	fl = FILELAYOUT_LSEG(lseg);
 	for (i = 0; i < fl->number_of_buckets; i++) {
-		if (list_empty(&fl->commit_buckets[i]))
+		if (list_empty(&fl->commit_buckets[i].committing))
 			continue;
 		data = nfs_commitdata_alloc();
 		if (!data)
-			goto out_bad;
+			break;
 		data->ds_commit_index = i;
 		data->lseg = lseg;
 		list_add(&data->pages, list);
+		nreq++;
 	}
-	put_lseg(lseg);
-	return 0;
 
-out_bad:
+	/* Clean up on error */
 	for (j = i; j < fl->number_of_buckets; j++) {
-		if (list_empty(&fl->commit_buckets[i]))
+		if (list_empty(&fl->commit_buckets[i].committing))
 			continue;
-		nfs_retry_commit(&fl->commit_buckets[i], lseg);
+		nfs_retry_commit(&fl->commit_buckets[i].committing, lseg);
 		put_lseg(lseg); /* associated with emptying bucket */
 	}
 	put_lseg(lseg);
 	/* Caller will clean up entries put on list */
-	return -ENOMEM;
+	return nreq;
 }
 
 /* This follows nfs_commit_list pretty closely */
@@ -874,40 +1031,40 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 {
 	struct nfs_write_data *data, *tmp;
 	LIST_HEAD(list);
+	unsigned int nreq = 0;
 
 	if (!list_empty(mds_pages)) {
 		data = nfs_commitdata_alloc();
-		if (!data)
-			goto out_bad;
-		data->lseg = NULL;
-		list_add(&data->pages, &list);
+		if (data != NULL) {
+			data->lseg = NULL;
+			list_add(&data->pages, &list);
+			nreq++;
+		} else
+			nfs_retry_commit(mds_pages, NULL);
 	}
 
-	if (alloc_ds_commits(inode, &list))
-		goto out_bad;
+	nreq += alloc_ds_commits(inode, &list);
+
+	if (nreq == 0) {
+		nfs_commit_clear_lock(NFS_I(inode));
+		goto out;
+	}
+
+	atomic_add(nreq, &NFS_I(inode)->commits_outstanding);
 
 	list_for_each_entry_safe(data, tmp, &list, pages) {
 		list_del_init(&data->pages);
-		atomic_inc(&NFS_I(inode)->commits_outstanding);
 		if (!data->lseg) {
 			nfs_init_commit(data, mds_pages, NULL);
 			nfs_initiate_commit(data, NFS_CLIENT(inode),
 					    data->mds_ops, how);
 		} else {
-			nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index], data->lseg);
+			nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index].committing, data->lseg);
 			filelayout_initiate_commit(data, how);
 		}
 	}
-	return 0;
- out_bad:
-	list_for_each_entry_safe(data, tmp, &list, pages) {
-		nfs_retry_commit(&data->pages, data->lseg);
-		list_del_init(&data->pages);
-		nfs_commit_free(data);
-	}
-	nfs_retry_commit(mds_pages, NULL);
-	nfs_commit_clear_lock(NFS_I(inode));
-	return -ENOMEM;
+out:
+	return PNFS_ATTEMPTED;
 }
 
 static void
@@ -924,8 +1081,9 @@ static struct pnfs_layoutdriver_type filelayout_type = {
 	.free_lseg = filelayout_free_lseg,
 	.pg_read_ops = &filelayout_pg_read_ops,
 	.pg_write_ops = &filelayout_pg_write_ops,
-	.mark_pnfs_commit = filelayout_mark_pnfs_commit,
-	.choose_commit_list = filelayout_choose_commit_list,
+	.mark_request_commit = filelayout_mark_request_commit,
+	.clear_request_commit = filelayout_clear_request_commit,
+	.scan_commit_lists = filelayout_scan_commit_lists,
 	.commit_pagelist = filelayout_commit_pagelist,
 	.read_pagelist = filelayout_read_pagelist,
 	.write_pagelist = filelayout_write_pagelist,
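
The nfs4filelayout.c rework above replaces the single per-data-server commit list with a two-list bucket: requests sit on "written" until a commit scan claims them, and filelayout_scan_ds_commit_list() moves at most "max" of them onto "committing" under the inode lock. A minimal user-space sketch of that handoff, with hypothetical req/bucket types standing in for the kernel's list_head machinery:

	#include <stdio.h>

	struct req {
		int index;
		struct req *next;
	};

	struct commit_bucket {
		struct req *written;    /* dirty reqs awaiting a future COMMIT */
		struct req *committing; /* reqs claimed by an in-flight COMMIT */
	};

	/* Move at most 'max' requests; returns how many were moved. */
	static int scan_bucket(struct commit_bucket *b, int max)
	{
		int moved = 0;

		while (b->written != NULL && moved < max) {
			struct req *r = b->written;

			b->written = r->next;
			r->next = b->committing;
			b->committing = r;
			moved++;
		}
		return moved;
	}

	int main(void)
	{
		struct req r2 = { 2, NULL }, r1 = { 1, &r2 };
		struct commit_bucket b = { &r1, NULL };

		printf("moved %d\n", scan_bucket(&b, 10)); /* prints "moved 2" */
		return 0;
	}

The split is what lets a later rewrite pull a request back off "written" (filelayout_clear_request_commit above) without disturbing a COMMIT that is already on the wire.
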
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 2e42284253fa..21190bb1f5e3 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -74,6 +74,11 @@ struct nfs4_file_layout_dsaddr {
 	struct nfs4_pnfs_ds	*ds_list[1];
 };
 
+struct nfs4_fl_commit_bucket {
+	struct list_head written;
+	struct list_head committing;
+};
+
 struct nfs4_filelayout_segment {
 	struct pnfs_layout_segment generic_hdr;
 	u32 stripe_type;
@@ -84,7 +89,7 @@ struct nfs4_filelayout_segment {
 	struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
 	unsigned int num_fh;
 	struct nfs_fh **fh_array;
-	struct list_head *commit_buckets; /* Sort commits to ds */
+	struct nfs4_fl_commit_bucket *commit_buckets; /* Sort commits to ds */
 	int number_of_buckets;
 };
 
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 8ae91908f5aa..a866bbd2890a 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -45,7 +45,7 @@
  * - incremented when a device id maps a data server already in the cache.
  * - decremented when deviceid is removed from the cache.
  */
-DEFINE_SPINLOCK(nfs4_ds_cache_lock);
+static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
 static LIST_HEAD(nfs4_data_server_cache);
 
 /* Debug routines */
@@ -108,58 +108,40 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
 	return false;
 }
 
-/*
- * Lookup DS by addresses. The first matching address returns true.
- * nfs4_ds_cache_lock is held
- */
-static struct nfs4_pnfs_ds *
-_data_server_lookup_locked(struct list_head *dsaddrs)
+static bool
+_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
+			       const struct list_head *dsaddrs2)
 {
-	struct nfs4_pnfs_ds *ds;
 	struct nfs4_pnfs_ds_addr *da1, *da2;
 
-	list_for_each_entry(da1, dsaddrs, da_node) {
-		list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
-			list_for_each_entry(da2, &ds->ds_addrs, da_node) {
-				if (same_sockaddr(
-					(struct sockaddr *)&da1->da_addr,
-					(struct sockaddr *)&da2->da_addr))
-					return ds;
-			}
-		}
+	/* step through both lists, comparing as we go */
+	for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
+	     da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
+	     da1 != NULL && da2 != NULL;
+	     da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
+	     da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
+		if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
+				   (struct sockaddr *)&da2->da_addr))
+			return false;
 	}
-	return NULL;
+	if (da1 == NULL && da2 == NULL)
+		return true;
+
+	return false;
 }
 
 /*
- * Compare two lists of addresses.
+ * Lookup DS by addresses. nfs4_ds_cache_lock is held
  */
-static bool
-_data_server_match_all_addrs_locked(struct list_head *dsaddrs1,
-				    struct list_head *dsaddrs2)
+static struct nfs4_pnfs_ds *
+_data_server_lookup_locked(const struct list_head *dsaddrs)
 {
-	struct nfs4_pnfs_ds_addr *da1, *da2;
-	size_t count1 = 0,
-	       count2 = 0;
-
-	list_for_each_entry(da1, dsaddrs1, da_node)
-		count1++;
-
-	list_for_each_entry(da2, dsaddrs2, da_node) {
-		bool found = false;
-		count2++;
-		list_for_each_entry(da1, dsaddrs1, da_node) {
-			if (same_sockaddr((struct sockaddr *)&da1->da_addr,
-					  (struct sockaddr *)&da2->da_addr)) {
-				found = true;
-				break;
-			}
-		}
-		if (!found)
-			return false;
-	}
+	struct nfs4_pnfs_ds *ds;
 
-	return (count1 == count2);
+	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
+		if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
+			return ds;
+	return NULL;
 }
 
 /*
@@ -356,11 +338,6 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
 		dprintk("%s add new data server %s\n", __func__,
 			ds->ds_remotestr);
 	} else {
-		if (!_data_server_match_all_addrs_locked(&tmp_ds->ds_addrs,
-							 dsaddrs)) {
-			dprintk("%s: multipath address mismatch: %s != %s",
-				__func__, tmp_ds->ds_remotestr, remotestr);
-		}
 		kfree(remotestr);
 		kfree(ds);
 		atomic_inc(&tmp_ds->ds_count);
@@ -378,7 +355,7 @@ out:
  * Currently only supports ipv4, ipv6 and one multi-path address.
  */
 static struct nfs4_pnfs_ds_addr *
-decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags)
+decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)
 {
 	struct nfs4_pnfs_ds_addr *da = NULL;
 	char *buf, *portstr;
@@ -457,7 +434,7 @@ decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags)
 
 	INIT_LIST_HEAD(&da->da_node);
 
-	if (!rpc_pton(buf, portstr-buf, (struct sockaddr *)&da->da_addr,
+	if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
 		      sizeof(da->da_addr))) {
 		dprintk("%s: error parsing address %s\n", __func__, buf);
 		goto out_free_da;
@@ -554,7 +531,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 	cnt = be32_to_cpup(p);
 	dprintk("%s stripe count %d\n", __func__, cnt);
 	if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
-		printk(KERN_WARNING "%s: stripe count %d greater than "
+		printk(KERN_WARNING "NFS: %s: stripe count %d greater than "
 		       "supported maximum %d\n", __func__,
 			cnt, NFS4_PNFS_MAX_STRIPE_CNT);
 		goto out_err_free_scratch;
@@ -585,7 +562,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 	num = be32_to_cpup(p);
 	dprintk("%s ds_num %u\n", __func__, num);
 	if (num > NFS4_PNFS_MAX_MULTI_CNT) {
-		printk(KERN_WARNING "%s: multipath count %d greater than "
+		printk(KERN_WARNING "NFS: %s: multipath count %d greater than "
 		       "supported maximum %d\n", __func__,
 			num, NFS4_PNFS_MAX_MULTI_CNT);
 		goto out_err_free_stripe_indices;
@@ -593,7 +570,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 
 	/* validate stripe indices are all < num */
 	if (max_stripe_index >= num) {
-		printk(KERN_WARNING "%s: stripe index %u >= num ds %u\n",
+		printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n",
 			__func__, max_stripe_index, num);
 		goto out_err_free_stripe_indices;
 	}
@@ -625,7 +602,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 
 	mp_count = be32_to_cpup(p); /* multipath count */
 	for (j = 0; j < mp_count; j++) {
-		da = decode_ds_addr(&stream, gfp_flags);
+		da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->net,
+				    &stream, gfp_flags);
 		if (da)
 			list_add_tail(&da->da_node, &dsaddrs);
 	}
@@ -686,7 +664,7 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
 
 	new = decode_device(inode, dev, gfp_flags);
 	if (!new) {
-		printk(KERN_WARNING "%s: Could not decode or add device\n",
+		printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
 			__func__);
 		return NULL;
 	}
@@ -835,7 +813,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
 	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
 
 	if (ds == NULL) {
-		printk(KERN_ERR "%s: No data server for offset index %d\n",
+		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
 			__func__, ds_idx);
 		return NULL;
 	}
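
The _same_data_server_addrs_locked() rewrite above turns the old count-and-match-in-any-order comparison into a strict element-for-element walk of the two address lists: two data servers are considered the same only if the lists match pairwise and have equal length. The same logic on plain pointers, as a sketch (hypothetical addr type; the kernel version iterates struct list_head nodes):

	#include <stdbool.h>
	#include <string.h>

	struct addr {
		char text[64];    /* stand-in for a struct sockaddr_storage */
		struct addr *next;
	};

	static bool same_addr_lists(const struct addr *a, const struct addr *b)
	{
		/* step through both lists, comparing as we go */
		for (; a != NULL && b != NULL; a = a->next, b = b->next)
			if (strcmp(a->text, b->text) != 0)
				return false;
		/* equal only if both lists ran out together */
		return a == NULL && b == NULL;
	}
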
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index bb80c49b6533..9c8eca315f43 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -94,13 +94,14 @@ static int nfs4_validate_fspath(struct dentry *dentry,
 }
 
 static size_t nfs_parse_server_name(char *string, size_t len,
-		struct sockaddr *sa, size_t salen)
+		struct sockaddr *sa, size_t salen, struct nfs_server *server)
 {
+	struct net *net = rpc_net_ns(server->client);
 	ssize_t ret;
 
-	ret = rpc_pton(string, len, sa, salen);
+	ret = rpc_pton(net, string, len, sa, salen);
 	if (ret == 0) {
-		ret = nfs_dns_resolve_name(string, len, sa, salen);
+		ret = nfs_dns_resolve_name(net, string, len, sa, salen);
 		if (ret < 0)
 			ret = 0;
 	}
@@ -137,7 +138,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 			continue;
 
 		mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
-				mountdata->addr, addr_bufsize);
+				mountdata->addr, addr_bufsize,
+				NFS_SB(mountdata->sb));
 		if (mountdata->addrlen == 0)
 			continue;
 
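
nfs_parse_server_name() keeps its parse-then-resolve shape: try the string as a presentation-format IP address first, and only fall back to a DNS lookup (now keyed by the mount's network namespace) when that fails. A rough user-space analogue of the same fallback, assuming IPv4 and standard libc calls rather than the rpc_pton()/nfs_dns_resolve_name() pair:

	#include <arpa/inet.h>
	#include <netdb.h>
	#include <string.h>

	/* Returns the sockaddr length on success, 0 on failure (mirroring
	 * the NFS helper's convention). Sketch only: IPv4, no port handling. */
	static socklen_t parse_server_name(const char *name, struct sockaddr_in *sa)
	{
		struct addrinfo *res;
		socklen_t len = 0;

		memset(sa, 0, sizeof(*sa));
		sa->sin_family = AF_INET;
		if (inet_pton(AF_INET, name, &sa->sin_addr) == 1)
			return sizeof(*sa);	/* literal address */

		if (getaddrinfo(name, NULL, NULL, &res) != 0)
			return 0;		/* resolution failed */
		if (res->ai_family == AF_INET) {
			memcpy(sa, res->ai_addr, res->ai_addrlen);
			len = (socklen_t)res->ai_addrlen;
		}
		freeaddrinfo(res);
		return len;
	}
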
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index caf92d05c3a9..e809d2305ebf 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -72,18 +72,21 @@
 
 #define NFS4_MAX_LOOP_ON_RECOVER (10)
 
+static unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE;
+
 struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
+static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
 static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 			    struct nfs_fattr *fattr, struct iattr *sattr,
 			    struct nfs4_state *state);
 #ifdef CONFIG_NFS_V4_1
-static int nfs41_test_stateid(struct nfs_server *, struct nfs4_state *);
-static int nfs41_free_stateid(struct nfs_server *, struct nfs4_state *);
+static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *);
+static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *);
 #endif
 /* Prevent leaks of NFSv4 errors into userland */
 static int nfs4_map_errors(int err)
@@ -259,15 +262,28 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 {
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_state *state = exception->state;
+	struct inode *inode = exception->inode;
 	int ret = errorcode;
 
 	exception->retry = 0;
 	switch(errorcode) {
 		case 0:
 			return 0;
+		case -NFS4ERR_OPENMODE:
+			if (nfs_have_delegation(inode, FMODE_READ)) {
+				nfs_inode_return_delegation(inode);
+				exception->retry = 1;
+				return 0;
+			}
+			if (state == NULL)
+				break;
+			nfs4_schedule_stateid_recovery(server, state);
+			goto wait_on_recovery;
+		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_BAD_STATEID:
-		case -NFS4ERR_OPENMODE:
+			if (state != NULL)
+				nfs_remove_bad_delegation(state->inode);
 			if (state == NULL)
 				break;
 			nfs4_schedule_stateid_recovery(server, state);
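
The new -NFS4ERR_OPENMODE branch above plugs into the standard NFSv4 retry idiom: callers wrap the raw operation in nfs4_handle_exception() and loop while the handler sets exception.retry, so returning a read delegation transparently re-drives the call. The caller shape, assembled from the nfs4_do_setattr() hunk later in this patch (a sketch of existing usage, not new API):

	struct nfs4_exception exception = {
		.state = state,
		.inode = inode,
	};
	int err;

	do {
		err = nfs4_handle_exception(server,
				_nfs4_do_setattr(inode, cred, fattr, sattr, state),
				&exception);
	} while (exception.retry);
	return err;
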
@@ -360,16 +376,14 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
  * When updating highest_used_slotid there may be "holes" in the bitmap
  * so we need to scan down from highest_used_slotid to 0 looking for the now
  * highest slotid in use.
- * If none found, highest_used_slotid is set to -1.
+ * If none found, highest_used_slotid is set to NFS4_NO_SLOT.
  *
  * Must be called while holding tbl->slot_tbl_lock
  */
 static void
-nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
+nfs4_free_slot(struct nfs4_slot_table *tbl, u32 slotid)
 {
-	int slotid = free_slotid;
-
-	BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE);
+	BUG_ON(slotid >= NFS4_MAX_SLOT_TABLE);
 	/* clear used bit in bitmap */
 	__clear_bit(slotid, tbl->used_slots);
 
@@ -379,10 +393,16 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
 	if (slotid < tbl->max_slots)
 		tbl->highest_used_slotid = slotid;
 	else
-		tbl->highest_used_slotid = -1;
+		tbl->highest_used_slotid = NFS4_NO_SLOT;
 	}
-	dprintk("%s: free_slotid %u highest_used_slotid %d\n", __func__,
-		free_slotid, tbl->highest_used_slotid);
+	dprintk("%s: slotid %u highest_used_slotid %d\n", __func__,
+		slotid, tbl->highest_used_slotid);
+}
+
+bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy)
+{
+	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
+	return true;
 }
 
 /*
@@ -390,16 +410,13 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
  */
 static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
 {
-	struct rpc_task *task;
-
 	if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
-		task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq);
-		if (task)
-			rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
+		rpc_wake_up_first(&ses->fc_slot_table.slot_tbl_waitq,
+				nfs4_set_task_privileged, NULL);
 		return;
 	}
 
-	if (ses->fc_slot_table.highest_used_slotid != -1)
+	if (ses->fc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
 		return;
 
 	dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__);
@@ -412,7 +429,7 @@ static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
 void nfs4_check_drain_bc_complete(struct nfs4_session *ses)
 {
 	if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) ||
-	    ses->bc_slot_table.highest_used_slotid != -1)
+	    ses->bc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
 		return;
 	dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);
 	complete(&ses->bc_slot_table.complete);
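
Note the shape of the rpc_wake_up_first() conversion above: instead of waking a task and then raising its priority (a window in which an already-woken task could run unprivileged), the drain path passes a callback that the RPC layer applies to the waiter before releasing it. A generic user-space sketch of that wake-with-action convention (hypothetical queue types):

	#include <stdbool.h>
	#include <pthread.h>

	struct task { int priority; struct task *next; };

	struct waitq {
		pthread_mutex_t lock;
		struct task *head;
	};

	/* Apply 'action' to the first waiter while the queue lock is still
	 * held, and only then dequeue it. */
	static struct task *wake_first(struct waitq *q,
				       bool (*action)(struct task *, void *),
				       void *data)
	{
		struct task *t = NULL;

		pthread_mutex_lock(&q->lock);
		if (q->head != NULL && action(q->head, data)) {
			t = q->head;
			q->head = t->next;
		}
		pthread_mutex_unlock(&q->lock);
		return t;
	}
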
@@ -507,25 +524,25 @@ static int nfs4_sequence_done(struct rpc_task *task,
  * nfs4_find_slot looks for an unset bit in the used_slots bitmap.
  * If found, we mark the slot as used, update the highest_used_slotid,
  * and respectively set up the sequence operation args.
- * The slot number is returned if found, or NFS4_MAX_SLOT_TABLE otherwise.
+ * The slot number is returned if found, or NFS4_NO_SLOT otherwise.
  *
  * Note: must be called with under the slot_tbl_lock.
  */
-static u8
+static u32
 nfs4_find_slot(struct nfs4_slot_table *tbl)
 {
-	int slotid;
-	u8 ret_id = NFS4_MAX_SLOT_TABLE;
-	BUILD_BUG_ON((u8)NFS4_MAX_SLOT_TABLE != (int)NFS4_MAX_SLOT_TABLE);
+	u32 slotid;
+	u32 ret_id = NFS4_NO_SLOT;
 
-	dprintk("--> %s used_slots=%04lx highest_used=%d max_slots=%d\n",
+	dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n",
 		__func__, tbl->used_slots[0], tbl->highest_used_slotid,
 		tbl->max_slots);
 	slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots);
 	if (slotid >= tbl->max_slots)
 		goto out;
 	__set_bit(slotid, tbl->used_slots);
-	if (slotid > tbl->highest_used_slotid)
+	if (slotid > tbl->highest_used_slotid ||
+	    tbl->highest_used_slotid == NFS4_NO_SLOT)
 		tbl->highest_used_slotid = slotid;
 	ret_id = slotid;
 out:
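
The slot table above is nothing more than a bitmap plus a cached high-water mark: allocation is find-first-zero-bit, and freeing clears the bit and rescans downward, with NFS4_NO_SLOT replacing the old -1 sentinel now that slotids are unsigned. A compact user-space model of both operations (hypothetical constants, a single 64-bit word instead of the kernel bitmap):

	#include <stdint.h>

	#define MAX_SLOTS 64u
	#define NO_SLOT   UINT32_MAX

	static uint64_t used_slots;
	static uint32_t highest_used = NO_SLOT;

	static uint32_t find_slot(uint32_t max_slots)
	{
		for (uint32_t i = 0; i < max_slots; i++)
			if (!(used_slots & (1ULL << i))) {
				used_slots |= 1ULL << i;
				if (highest_used == NO_SLOT || i > highest_used)
					highest_used = i;
				return i;
			}
		return NO_SLOT; /* caller sleeps on the slot waitqueue */
	}

	static void free_slot(uint32_t slotid)
	{
		used_slots &= ~(1ULL << slotid);
		if (slotid != highest_used)
			return;
		/* scan down for the new highest slotid in use */
		highest_used = NO_SLOT;
		for (uint32_t i = slotid; i-- > 0; )
			if (used_slots & (1ULL << i)) {
				highest_used = i;
				break;
			}
	}
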
@@ -534,15 +551,25 @@ out:
 	return ret_id;
 }
 
+static void nfs41_init_sequence(struct nfs4_sequence_args *args,
+	struct nfs4_sequence_res *res, int cache_reply)
+{
+	args->sa_session = NULL;
+	args->sa_cache_this = 0;
+	if (cache_reply)
+		args->sa_cache_this = 1;
+	res->sr_session = NULL;
+	res->sr_slot = NULL;
+}
+
 int nfs41_setup_sequence(struct nfs4_session *session,
 				struct nfs4_sequence_args *args,
 				struct nfs4_sequence_res *res,
-				int cache_reply,
 				struct rpc_task *task)
 {
 	struct nfs4_slot *slot;
 	struct nfs4_slot_table *tbl;
-	u8 slotid;
+	u32 slotid;
 
 	dprintk("--> %s\n", __func__);
 	/* slot already allocated? */
@@ -570,7 +597,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,
 	}
 
 	slotid = nfs4_find_slot(tbl);
-	if (slotid == NFS4_MAX_SLOT_TABLE) {
+	if (slotid == NFS4_NO_SLOT) {
 		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
 		spin_unlock(&tbl->slot_tbl_lock);
 		dprintk("<-- %s: no free slots\n", __func__);
@@ -582,7 +609,6 @@ int nfs41_setup_sequence(struct nfs4_session *session,
 	slot = tbl->slots + slotid;
 	args->sa_session = session;
 	args->sa_slotid = slotid;
-	args->sa_cache_this = cache_reply;
 
 	dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
 
@@ -602,24 +628,19 @@ EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
 int nfs4_setup_sequence(const struct nfs_server *server,
 			struct nfs4_sequence_args *args,
 			struct nfs4_sequence_res *res,
-			int cache_reply,
 			struct rpc_task *task)
 {
 	struct nfs4_session *session = nfs4_get_session(server);
 	int ret = 0;
 
-	if (session == NULL) {
-		args->sa_session = NULL;
-		res->sr_session = NULL;
+	if (session == NULL)
 		goto out;
-	}
 
 	dprintk("--> %s clp %p session %p sr_slot %td\n",
 		__func__, session->clp, session, res->sr_slot ?
 		res->sr_slot - session->fc_slot_table.slots : -1);
 
-	ret = nfs41_setup_sequence(session, args, res, cache_reply,
-				   task);
+	ret = nfs41_setup_sequence(session, args, res, task);
 out:
 	dprintk("<-- %s status=%d\n", __func__, ret);
 	return ret;
@@ -629,7 +650,6 @@ struct nfs41_call_sync_data {
 	const struct nfs_server *seq_server;
 	struct nfs4_sequence_args *seq_args;
 	struct nfs4_sequence_res *seq_res;
-	int cache_reply;
 };
 
 static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
@@ -639,7 +659,7 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
 	dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
 
 	if (nfs4_setup_sequence(data->seq_server, data->seq_args,
-				data->seq_res, data->cache_reply, task))
+				data->seq_res, task))
 		return;
 	rpc_call_start(task);
 }
@@ -657,12 +677,12 @@ static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
 	nfs41_sequence_done(task, data->seq_res);
 }
 
-struct rpc_call_ops nfs41_call_sync_ops = {
+static const struct rpc_call_ops nfs41_call_sync_ops = {
 	.rpc_call_prepare = nfs41_call_sync_prepare,
 	.rpc_call_done = nfs41_call_sync_done,
 };
 
-struct rpc_call_ops nfs41_call_priv_sync_ops = {
+static const struct rpc_call_ops nfs41_call_priv_sync_ops = {
 	.rpc_call_prepare = nfs41_call_priv_sync_prepare,
 	.rpc_call_done = nfs41_call_sync_done,
 };
@@ -672,7 +692,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
 			   struct rpc_message *msg,
 			   struct nfs4_sequence_args *args,
 			   struct nfs4_sequence_res *res,
-			   int cache_reply,
 			   int privileged)
 {
 	int ret;
@@ -681,7 +700,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
 		.seq_server = server,
 		.seq_args = args,
 		.seq_res = res,
-		.cache_reply = cache_reply,
 	};
 	struct rpc_task_setup task_setup = {
 		.rpc_client = clnt,
@@ -690,7 +708,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
 		.callback_data = &data
 	};
 
-	res->sr_slot = NULL;
 	if (privileged)
 		task_setup.callback_ops = &nfs41_call_priv_sync_ops;
 	task = rpc_run_task(&task_setup);
@@ -710,10 +727,17 @@ int _nfs4_call_sync_session(struct rpc_clnt *clnt,
 			    struct nfs4_sequence_res *res,
 			    int cache_reply)
 {
-	return nfs4_call_sync_sequence(clnt, server, msg, args, res, cache_reply, 0);
+	nfs41_init_sequence(args, res, cache_reply);
+	return nfs4_call_sync_sequence(clnt, server, msg, args, res, 0);
 }
 
 #else
+static inline
+void nfs41_init_sequence(struct nfs4_sequence_args *args,
+	struct nfs4_sequence_res *res, int cache_reply)
+{
+}
+
 static int nfs4_sequence_done(struct rpc_task *task,
 			       struct nfs4_sequence_res *res)
 {
@@ -728,7 +752,7 @@ int _nfs4_call_sync(struct rpc_clnt *clnt,
 		   struct nfs4_sequence_res *res,
 		   int cache_reply)
 {
-	args->sa_session = res->sr_session = NULL;
+	nfs41_init_sequence(args, res, cache_reply);
 	return rpc_call_sync(clnt, msg, 0);
 }
 
@@ -815,20 +839,22 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	p->o_arg.open_flags = flags;
 	p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
 	p->o_arg.clientid = server->nfs_client->cl_clientid;
-	p->o_arg.id = sp->so_owner_id.id;
+	p->o_arg.id = sp->so_seqid.owner_id;
 	p->o_arg.name = &dentry->d_name;
 	p->o_arg.server = server;
 	p->o_arg.bitmask = server->attr_bitmask;
 	p->o_arg.dir_bitmask = server->cache_consistency_bitmask;
 	p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
-	if (flags & O_CREAT) {
-		u32 *s;
+	if (attrs != NULL && attrs->ia_valid != 0) {
+		__be32 verf[2];
 
 		p->o_arg.u.attrs = &p->attrs;
 		memcpy(&p->attrs, attrs, sizeof(p->attrs));
-		s = (u32 *) p->o_arg.u.verifier.data;
-		s[0] = jiffies;
-		s[1] = current->pid;
+
+		verf[0] = jiffies;
+		verf[1] = current->pid;
+		memcpy(p->o_arg.u.verifier.data, verf,
+			sizeof(p->o_arg.u.verifier.data));
 	}
 	p->c_arg.fh = &p->o_res.fh;
 	p->c_arg.stateid = &p->o_res.stateid;
@@ -878,7 +904,7 @@ static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode
 {
 	int ret = 0;
 
-	if (open_mode & O_EXCL)
+	if (open_mode & (O_EXCL|O_TRUNC))
 		goto out;
 	switch (mode & (FMODE_READ|FMODE_WRITE)) {
 		case FMODE_READ:
@@ -927,8 +953,8 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
 static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
 {
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
-		memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data));
-	memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data));
+		nfs4_stateid_copy(&state->stateid, stateid);
+	nfs4_stateid_copy(&state->open_stateid, stateid);
 	switch (fmode) {
 		case FMODE_READ:
 			set_bit(NFS_O_RDONLY_STATE, &state->flags);
@@ -956,7 +982,7 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
 	 */
 	write_seqlock(&state->seqlock);
 	if (deleg_stateid != NULL) {
-		memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data));
+		nfs4_stateid_copy(&state->stateid, deleg_stateid);
 		set_bit(NFS_DELEGATED_STATE, &state->flags);
 	}
 	if (open_stateid != NULL)
@@ -987,7 +1013,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat
 
 	if (delegation == NULL)
 		delegation = &deleg_cur->stateid;
-	else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0)
+	else if (!nfs4_stateid_match(&deleg_cur->stateid, delegation))
 		goto no_delegation_unlock;
 
 	nfs_mark_delegation_referenced(deleg_cur);
@@ -1026,7 +1052,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 	struct nfs4_state *state = opendata->state;
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	struct nfs_delegation *delegation;
-	int open_mode = opendata->o_arg.open_flags & O_EXCL;
+	int open_mode = opendata->o_arg.open_flags & (O_EXCL|O_TRUNC);
 	fmode_t fmode = opendata->o_arg.fmode;
 	nfs4_stateid stateid;
 	int ret = -EAGAIN;
@@ -1048,7 +1074,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 			break;
 		}
 		/* Save the delegation */
-		memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
+		nfs4_stateid_copy(&stateid, &delegation->stateid);
 		rcu_read_unlock();
 		ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
 		if (ret != 0)
@@ -1090,6 +1116,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
 	if (state == NULL)
 		goto err_put_inode;
 	if (data->o_res.delegation_type != 0) {
+		struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 		int delegation_flags = 0;
 
 		rcu_read_lock();
@@ -1101,7 +1128,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
 			pr_err_ratelimited("NFS: Broken NFSv4 server %s is "
 					"returning a delegation for "
 					"OPEN(CLAIM_DELEGATE_CUR)\n",
-					NFS_CLIENT(inode)->cl_server);
+					clp->cl_hostname);
 		} else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
 			nfs_inode_set_delegation(state->inode,
 					data->owner->so_cred,
@@ -1210,10 +1237,10 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
 	 * Check if we need to update the current stateid.
 	 */
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 &&
-	    memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) {
+	    !nfs4_stateid_match(&state->stateid, &state->open_stateid)) {
 		write_seqlock(&state->seqlock);
 		if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
-			memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data));
+			nfs4_stateid_copy(&state->stateid, &state->open_stateid);
 		write_sequnlock(&state->seqlock);
 	}
 	return 0;
@@ -1282,8 +1309,7 @@ static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs
 	if (IS_ERR(opendata))
 		return PTR_ERR(opendata);
 	opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR;
-	memcpy(opendata->o_arg.u.delegation.data, stateid->data,
-			sizeof(opendata->o_arg.u.delegation.data));
+	nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid);
 	ret = nfs4_open_recover(opendata, state);
 	nfs4_opendata_put(opendata);
 	return ret;
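
All the memcpy()/memcmp() pairs replaced above collapse into two helpers whose definitions are added elsewhere in this series. A plausible shape, assuming they are thin wrappers over the fixed-size stateid buffer (a sketch, not the verified kernel definitions):

	static inline void nfs4_stateid_copy(nfs4_stateid *dst,
					     const nfs4_stateid *src)
	{
		memcpy(dst->data, src->data, sizeof(dst->data));
	}

	static inline bool nfs4_stateid_match(const nfs4_stateid *dst,
					      const nfs4_stateid *src)
	{
		return memcmp(dst->data, src->data, sizeof(dst->data)) == 0;
	}

Centralizing the comparison also makes it harder to get the length argument wrong, as the open-coded NFS4_STATEID_SIZE memcmp above could.
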
@@ -1319,8 +1345,11 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
 			 * The show must go on: exit, but mark the
 			 * stateid as needing recovery.
 			 */
+			case -NFS4ERR_DELEG_REVOKED:
 			case -NFS4ERR_ADMIN_REVOKED:
 			case -NFS4ERR_BAD_STATEID:
+				nfs_inode_find_state_and_recover(state->inode,
+						stateid);
 				nfs4_schedule_stateid_recovery(server, state);
 			case -EKEYEXPIRED:
 				/*
@@ -1345,8 +1374,7 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
 
 	data->rpc_status = task->tk_status;
 	if (data->rpc_status == 0) {
-		memcpy(data->o_res.stateid.data, data->c_res.stateid.data,
-				sizeof(data->o_res.stateid.data));
+		nfs4_stateid_copy(&data->o_res.stateid, &data->c_res.stateid);
 		nfs_confirm_seqid(&data->owner->so_seqid, 0);
 		renew_lease(data->o_res.server, data->timestamp);
 		data->rpc_done = 1;
@@ -1440,7 +1468,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 		rcu_read_unlock();
 	}
 	/* Update sequence id. */
-	data->o_arg.id = sp->so_owner_id.id;
+	data->o_arg.id = sp->so_seqid.owner_id;
 	data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid;
 	if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
@@ -1449,7 +1477,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 	data->timestamp = jiffies;
 	if (nfs4_setup_sequence(data->o_arg.server,
 				&data->o_arg.seq_args,
-				&data->o_res.seq_res, 1, task))
+				&data->o_res.seq_res, task))
 		return;
 	rpc_call_start(task);
 	return;
@@ -1551,6 +1579,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
 	};
 	int status;
 
+	nfs41_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1);
 	kref_get(&data->kref);
 	data->rpc_done = 0;
 	data->rpc_status = 0;
@@ -1712,15 +1741,32 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
 }
 
 #if defined(CONFIG_NFS_V4_1)
-static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+static int nfs41_check_expired_stateid(struct nfs4_state *state, nfs4_stateid *stateid, unsigned int flags)
 {
-	int status;
+	int status = NFS_OK;
 	struct nfs_server *server = NFS_SERVER(state->inode);
 
-	status = nfs41_test_stateid(server, state);
-	if (status == NFS_OK)
-		return 0;
-	nfs41_free_stateid(server, state);
+	if (state->flags & flags) {
+		status = nfs41_test_stateid(server, stateid);
+		if (status != NFS_OK) {
+			nfs41_free_stateid(server, stateid);
+			state->flags &= ~flags;
+		}
+	}
+	return status;
+}
+
+static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+	int deleg_status, open_status;
+	int deleg_flags = 1 << NFS_DELEGATED_STATE;
+	int open_flags = (1 << NFS_O_RDONLY_STATE) | (1 << NFS_O_WRONLY_STATE) | (1 << NFS_O_RDWR_STATE);
+
+	deleg_status = nfs41_check_expired_stateid(state, &state->stateid, deleg_flags);
+	open_status = nfs41_check_expired_stateid(state, &state->open_stateid, open_flags);
+
+	if ((deleg_status == NFS_OK) && (open_status == NFS_OK))
+		return NFS_OK;
 	return nfs4_open_expired(sp, state);
 }
 #endif
@@ -1754,7 +1800,8 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode
 
 	/* Protect against reboot recovery conflicts */
 	status = -ENOMEM;
-	if (!(sp = nfs4_get_state_owner(server, cred))) {
+	sp = nfs4_get_state_owner(server, cred, GFP_KERNEL);
+	if (sp == NULL) {
 		dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n");
 		goto out_err;
 	}
@@ -1829,7 +1876,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry,
 	 * the user though...
 	 */
 	if (status == -NFS4ERR_BAD_SEQID) {
-		printk(KERN_WARNING "NFS: v4 server %s "
+		pr_warn_ratelimited("NFS: v4 server %s "
 		       " returned a bad sequence-id error!\n",
 		       NFS_SERVER(dir)->nfs_client->cl_hostname);
 		exception.retry = 1;
@@ -1882,12 +1929,14 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 
 	nfs_fattr_init(fattr);
 
-	if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) {
+	if (state != NULL) {
+		nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE,
+				current->files, current->tgid);
+	} else if (nfs4_copy_delegation_stateid(&arg.stateid, inode,
+				FMODE_WRITE)) {
 		/* Use that stateid */
-	} else if (state != NULL) {
-		nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid);
 	} else
-		memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
+		nfs4_stateid_copy(&arg.stateid, &zero_stateid);
 
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	if (status == 0 && state != NULL)
@@ -1900,7 +1949,10 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 			     struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.state = state,
+		.inode = inode,
+	};
 	int err;
 	do {
 		err = nfs4_handle_exception(server,
@@ -1954,6 +2006,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 	struct nfs4_state *state = calldata->state;
 	struct nfs_server *server = NFS_SERVER(calldata->inode);
 
+	dprintk("%s: begin!\n", __func__);
 	if (!nfs4_sequence_done(task, &calldata->res.seq_res))
 		return;
 	/* hmm. we are done with the inode, and in the process of freeing
@@ -1981,6 +2034,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 	}
 	nfs_release_seqid(calldata->arg.seqid);
 	nfs_refresh_inode(calldata->inode, calldata->res.fattr);
+	dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);
 }
 
 static void nfs4_close_prepare(struct rpc_task *task, void *data)
@@ -1989,6 +2043,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 	struct nfs4_state *state = calldata->state;
 	int call_close = 0;
 
+	dprintk("%s: begin!\n", __func__);
 	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
 		return;
 
@@ -2013,7 +2068,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 	if (!call_close) {
 		/* Note: exit _without_ calling nfs4_close_done */
 		task->tk_action = NULL;
-		return;
+		goto out;
 	}
 
 	if (calldata->arg.fmode == 0) {
@@ -2022,17 +2077,20 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 		    pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) {
 			rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,
 					task, NULL);
-			return;
+			goto out;
 		}
 	}
 
 	nfs_fattr_init(calldata->res.fattr);
2030 calldata->timestamp = jiffies; 2085 calldata->timestamp = jiffies;
2031 if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), 2086 if (nfs4_setup_sequence(NFS_SERVER(calldata->inode),
2032 &calldata->arg.seq_args, &calldata->res.seq_res, 2087 &calldata->arg.seq_args,
2033 1, task)) 2088 &calldata->res.seq_res,
2034 return; 2089 task))
2090 goto out;
2035 rpc_call_start(task); 2091 rpc_call_start(task);
2092out:
2093 dprintk("%s: done!\n", __func__);
2036} 2094}
2037 2095
2038static const struct rpc_call_ops nfs4_close_ops = { 2096static const struct rpc_call_ops nfs4_close_ops = {
@@ -2074,6 +2132,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
 	calldata = kzalloc(sizeof(*calldata), gfp_mask);
 	if (calldata == NULL)
 		goto out;
+	nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1);
 	calldata->inode = state->inode;
 	calldata->state = state;
 	calldata->arg.fh = NFS_FH(state->inode);
@@ -2182,6 +2241,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
 		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
 		server->acl_bitmask = res.acl_bitmask;
+		server->fh_expire_type = res.fh_expire_type;
 	}
 
 	return status;
@@ -2303,7 +2363,6 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
 	return nfs4_map_errors(status);
 }
 
-static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
 /*
  * Get locations and (maybe) other attributes of a referral.
  * Note that we'll actually follow the referral later when
@@ -2420,6 +2479,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 		}
 	}
 
+	/* Deal with open(O_TRUNC) */
+	if (sattr->ia_valid & ATTR_OPEN)
+		sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
+
 	status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
 	if (status == 0)
 		nfs_setattr_update_inode(inode, sattr);
@@ -2494,7 +2557,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs4_accessargs args = {
 		.fh = NFS_FH(inode),
-		.bitmask = server->attr_bitmask,
+		.bitmask = server->cache_consistency_bitmask,
 	};
 	struct nfs4_accessres res = {
 		.server = server,
@@ -2712,8 +2775,18 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 
 	args->bitmask = server->cache_consistency_bitmask;
 	res->server = server;
-	res->seq_res.sr_slot = NULL;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
+	nfs41_init_sequence(&args->seq_args, &res->seq_res, 1);
+}
+
+static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
+{
+	if (nfs4_setup_sequence(NFS_SERVER(data->dir),
+				&data->args.seq_args,
+				&data->res.seq_res,
+				task))
+		return;
+	rpc_call_start(task);
 }
 
 static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
@@ -2738,6 +2811,17 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
 	arg->bitmask = server->attr_bitmask;
 	res->server = server;
+	nfs41_init_sequence(&arg->seq_args, &res->seq_res, 1);
+}
+
+static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
+{
+	if (nfs4_setup_sequence(NFS_SERVER(data->old_dir),
+				&data->args.seq_args,
+				&data->res.seq_res,
+				task))
+		return;
+	rpc_call_start(task);
 }
 
 static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
@@ -3232,6 +3316,17 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message
 	data->timestamp = jiffies;
 	data->read_done_cb = nfs4_read_done_cb;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+}
+
+static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+{
+	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
+				&data->args.seq_args,
+				&data->res.seq_res,
+				task))
+		return;
+	rpc_call_start(task);
 }
 
 /* Reset the the nfs_read_data to send the read to the MDS. */
@@ -3305,6 +3400,17 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
 	data->timestamp = jiffies;
 
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
+	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+}
+
+static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
+{
+	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
+				&data->args.seq_args,
+				&data->res.seq_res,
+				task))
+		return;
+	rpc_call_start(task);
 }
 
 static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *data)
@@ -3339,6 +3445,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
 	data->write_done_cb = nfs4_commit_done_cb;
 	data->res.server = server;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
+	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
 }
 
 struct nfs4_renewdata {
@@ -3714,8 +3821,11 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 	if (task->tk_status >= 0)
 		return 0;
 	switch(task->tk_status) {
+		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_BAD_STATEID:
+			if (state != NULL)
+				nfs_remove_bad_delegation(state->inode);
 		case -NFS4ERR_OPENMODE:
 			if (state == NULL)
 				break;
@@ -3764,6 +3874,16 @@ wait_on_recovery:
 	return -EAGAIN;
 }
 
+static void nfs4_construct_boot_verifier(struct nfs_client *clp,
+					 nfs4_verifier *bootverf)
+{
+	__be32 verf[2];
+
+	verf[0] = htonl((u32)clp->cl_boot_time.tv_sec);
+	verf[1] = htonl((u32)clp->cl_boot_time.tv_nsec);
+	memcpy(bootverf->data, verf, sizeof(bootverf->data));
+}
+
 int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 		unsigned short port, struct rpc_cred *cred,
 		struct nfs4_setclientid_res *res)
@@ -3780,15 +3900,13 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 		.rpc_resp = res,
 		.rpc_cred = cred,
 	};
-	__be32 *p;
 	int loop = 0;
 	int status;
 
-	p = (__be32*)sc_verifier.data;
-	*p++ = htonl((u32)clp->cl_boot_time.tv_sec);
-	*p = htonl((u32)clp->cl_boot_time.tv_nsec);
+	nfs4_construct_boot_verifier(clp, &sc_verifier);
 
 	for(;;) {
+		rcu_read_lock();
 		setclientid.sc_name_len = scnprintf(setclientid.sc_name,
 				sizeof(setclientid.sc_name), "%s/%s %s %s %u",
 				clp->cl_ipaddr,
@@ -3805,6 +3923,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 		setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,
 				sizeof(setclientid.sc_uaddr), "%s.%u.%u",
 				clp->cl_ipaddr, port >> 8, port & 255);
+		rcu_read_unlock();
 
 		status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
 		if (status != -NFS4ERR_CLID_INUSE)
@@ -3891,7 +4010,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
 
 	if (nfs4_setup_sequence(d_data->res.server,
 				&d_data->args.seq_args,
-				&d_data->res.seq_res, 1, task))
+				&d_data->res.seq_res, task))
 		return;
 	rpc_call_start(task);
 }
@@ -3925,11 +4044,12 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	data = kzalloc(sizeof(*data), GFP_NOFS);
 	if (data == NULL)
 		return -ENOMEM;
+	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
 	data->args.fhandle = &data->fh;
 	data->args.stateid = &data->stateid;
 	data->args.bitmask = server->attr_bitmask;
 	nfs_copy_fh(&data->fh, NFS_FH(inode));
-	memcpy(&data->stateid, stateid, sizeof(data->stateid));
+	nfs4_stateid_copy(&data->stateid, stateid);
 	data->res.fattr = &data->fattr;
 	data->res.server = server;
 	nfs_fattr_init(data->res.fattr);
@@ -4016,7 +4136,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 	if (status != 0)
 		goto out;
 	lsp = request->fl_u.nfs4_fl.owner;
-	arg.lock_owner.id = lsp->ls_id.id;
+	arg.lock_owner.id = lsp->ls_seqid.owner_id;
 	arg.lock_owner.s_dev = server->s_dev;
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	switch (status) {
@@ -4112,9 +4232,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 		return;
 	switch (task->tk_status) {
 		case 0:
-			memcpy(calldata->lsp->ls_stateid.data,
-					calldata->res.stateid.data,
-					sizeof(calldata->lsp->ls_stateid.data));
+			nfs4_stateid_copy(&calldata->lsp->ls_stateid,
+					&calldata->res.stateid);
 			renew_lease(calldata->server, calldata->timestamp);
 			break;
 		case -NFS4ERR_BAD_STATEID:
@@ -4142,7 +4261,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 	calldata->timestamp = jiffies;
 	if (nfs4_setup_sequence(calldata->server,
 				&calldata->arg.seq_args,
-				&calldata->res.seq_res, 1, task))
+				&calldata->res.seq_res, task))
 		return;
 	rpc_call_start(task);
 }
@@ -4182,6 +4301,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);
 	msg.rpc_argp = &data->arg;
 	msg.rpc_resp = &data->res;
 	task_setup_data.callback_data = data;
@@ -4261,7 +4381,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 		goto out_free_seqid;
 	p->arg.lock_stateid = &lsp->ls_stateid;
 	p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
-	p->arg.lock_owner.id = lsp->ls_id.id;
+	p->arg.lock_owner.id = lsp->ls_seqid.owner_id;
 	p->arg.lock_owner.s_dev = server->s_dev;
 	p->res.lock_seqid = p->arg.lock_seqid;
 	p->lsp = lsp;
@@ -4297,7 +4417,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 	data->timestamp = jiffies;
 	if (nfs4_setup_sequence(data->server,
 				&data->arg.seq_args,
-				&data->res.seq_res, 1, task))
+				&data->res.seq_res, task))
 		return;
 	rpc_call_start(task);
 	dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
@@ -4326,8 +4446,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 		goto out;
 	}
 	if (data->rpc_status == 0) {
-		memcpy(data->lsp->ls_stateid.data, data->res.stateid.data,
-					sizeof(data->lsp->ls_stateid.data));
+		nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid);
 		data->lsp->ls_flags |= NFS_LOCK_INITIALIZED;
 		renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp);
 	}
@@ -4415,6 +4534,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 		data->arg.reclaim = NFS_LOCK_RECLAIM;
 		task_setup_data.callback_ops = &nfs4_recover_lock_ops;
 	}
+	nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);
 	msg.rpc_argp = &data->arg;
 	msg.rpc_resp = &data->res;
 	task_setup_data.callback_data = data;
@@ -4479,15 +4599,34 @@ out:
 }
 
 #if defined(CONFIG_NFS_V4_1)
-static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request)
+static int nfs41_check_expired_locks(struct nfs4_state *state)
 {
-	int status;
+	int status, ret = NFS_OK;
+	struct nfs4_lock_state *lsp;
 	struct nfs_server *server = NFS_SERVER(state->inode);
 
-	status = nfs41_test_stateid(server, state);
+	list_for_each_entry(lsp, &state->lock_states, ls_locks) {
+		if (lsp->ls_flags & NFS_LOCK_INITIALIZED) {
+			status = nfs41_test_stateid(server, &lsp->ls_stateid);
+			if (status != NFS_OK) {
+				nfs41_free_stateid(server, &lsp->ls_stateid);
+				lsp->ls_flags &= ~NFS_LOCK_INITIALIZED;
+				ret = status;
+			}
+		}
+	};
+
+	return ret;
+}
+
+static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request)
+{
+	int status = NFS_OK;
+
+	if (test_bit(LK_STATE_IN_USE, &state->flags))
+		status = nfs41_check_expired_locks(state);
 	if (status == NFS_OK)
-		return 0;
-	nfs41_free_stateid(server, state);
+		return status;
 	return nfs4_lock_expired(state, request);
 }
 #endif
@@ -4523,7 +4662,8 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 	/* Note: we always want to sleep here! */
 	request->fl_flags = fl_flags | FL_SLEEP;
 	if (do_vfs_lock(request->fl_file, request) < 0)
-		printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__);
+		printk(KERN_WARNING "NFS: %s: VFS is out of sync with lock "
+					"manager!\n", __func__);
 out_unlock:
 	up_read(&nfsi->rwsem);
 out:
@@ -4533,7 +4673,9 @@ out:
 
 static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.state = state,
+	};
 	int err;
 
 	do {
@@ -4603,8 +4745,8 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
 		err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
 		switch (err) {
 			default:
-				printk(KERN_ERR "%s: unhandled error %d.\n",
-						__func__, err);
+				printk(KERN_ERR "NFS: %s: unhandled error "
+					"%d.\n", __func__, err);
 			case 0:
 			case -ESTALE:
 				goto out;
@@ -4626,6 +4768,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
 			 * The show must go on: exit, but mark the
 			 * stateid as needing recovery.
 			 */
+			case -NFS4ERR_DELEG_REVOKED:
 			case -NFS4ERR_ADMIN_REVOKED:
 			case -NFS4ERR_BAD_STATEID:
 			case -NFS4ERR_OPENMODE:
@@ -4655,33 +4798,44 @@ out:
 	return err;
 }
 
+struct nfs_release_lockowner_data {
+	struct nfs4_lock_state *lsp;
+	struct nfs_server *server;
+	struct nfs_release_lockowner_args args;
+};
+
 static void nfs4_release_lockowner_release(void *calldata)
 {
+	struct nfs_release_lockowner_data *data = calldata;
+	nfs4_free_lock_state(data->server, data->lsp);
 	kfree(calldata);
 }
 
-const struct rpc_call_ops nfs4_release_lockowner_ops = {
+static const struct rpc_call_ops nfs4_release_lockowner_ops = {
 	.rpc_release = nfs4_release_lockowner_release,
 };
 
-void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
+int nfs4_release_lockowner(struct nfs4_lock_state *lsp)
 {
 	struct nfs_server *server = lsp->ls_state->owner->so_server;
-	struct nfs_release_lockowner_args *args;
+	struct nfs_release_lockowner_data *data;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],
 	};
 
 	if (server->nfs_client->cl_mvops->minor_version != 0)
-		return;
-	args = kmalloc(sizeof(*args), GFP_NOFS);
-	if (!args)
-		return;
-	args->lock_owner.clientid = server->nfs_client->cl_clientid;
-	args->lock_owner.id = lsp->ls_id.id;
-	args->lock_owner.s_dev = server->s_dev;
-	msg.rpc_argp = args;
-	rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
+		return -EINVAL;
+	data = kmalloc(sizeof(*data), GFP_NOFS);
+	if (!data)
+		return -ENOMEM;
+	data->lsp = lsp;
+	data->server = server;
+	data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
+	data->args.lock_owner.id = lsp->ls_seqid.owner_id;
+	data->args.lock_owner.s_dev = server->s_dev;
+	msg.rpc_argp = &data->args;
+	rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
+	return 0;
 }
 
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
@@ -4727,11 +4881,11 @@ static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)
 	if (!(((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) ||
 	       (fattr->valid & NFS_ATTR_FATTR_FILEID)) &&
 	      (fattr->valid & NFS_ATTR_FATTR_FSID) &&
-	      (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)))
+	      (fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS)))
 		return;
 
 	fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
-		NFS_ATTR_FATTR_NLINK;
+		NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_V4_REFERRAL;
 	fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
 	fattr->nlink = 2;
 }
@@ -4798,7 +4952,8 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct
 	return status;
 }
 
-int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors)
+static int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name,
+		struct nfs4_secinfo_flavors *flavors)
 {
 	struct nfs4_exception exception = { };
 	int err;
@@ -4852,6 +5007,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	nfs4_verifier verifier;
 	struct nfs41_exchange_id_args args = {
+		.verifier = &verifier,
 		.client = clp,
 		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
 	};
@@ -4865,15 +5021,11 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 		.rpc_resp = &res,
 		.rpc_cred = cred,
 	};
-	__be32 *p;
 
 	dprintk("--> %s\n", __func__);
 	BUG_ON(clp == NULL);
 
-	p = (u32 *)verifier.data;
-	*p++ = htonl((u32)clp->cl_boot_time.tv_sec);
-	*p = htonl((u32)clp->cl_boot_time.tv_nsec);
-	args.verifier = &verifier;
+	nfs4_construct_boot_verifier(clp, &verifier);
 
 	args.id_len = scnprintf(args.id, sizeof(args.id),
 				"%s/%s.%s/%u",
@@ -4888,11 +5040,24 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 		goto out;
 	}
 
+	res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_KERNEL);
+	if (unlikely(!res.impl_id)) {
+		status = -ENOMEM;
+		goto out_server_scope;
+	}
+
 	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
 	if (!status)
 		status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
 
 	if (!status) {
+		/* use the most recent implementation id */
+		kfree(clp->impl_id);
+		clp->impl_id = res.impl_id;
+	} else
+		kfree(res.impl_id);
+
+	if (!status) {
 		if (clp->server_scope &&
 		    !nfs41_same_server_scope(clp->server_scope,
 					     res.server_scope)) {
@@ -4908,8 +5073,16 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 			goto out;
 		}
 	}
+
+out_server_scope:
 	kfree(res.server_scope);
 out:
+	if (clp->impl_id)
+		dprintk("%s: Server Implementation ID: "
+			"domain: %s, name: %s, date: %llu,%u\n",
+			__func__, clp->impl_id->domain, clp->impl_id->name,
+			clp->impl_id->date.seconds,
+			clp->impl_id->date.nseconds);
 	dprintk("<-- %s status= %d\n", __func__, status);
 	return status;
 }
@@ -4933,7 +5106,7 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task,
 	   since we're invoked within one */
 	ret = nfs41_setup_sequence(data->clp->cl_session,
 				   &data->args->la_seq_args,
-				   &data->res->lr_seq_res, 0, task);
+				   &data->res->lr_seq_res, task);
 
 	BUG_ON(ret == -EAGAIN);
 	rpc_call_start(task);
@@ -4966,7 +5139,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
 	dprintk("<-- %s\n", __func__);
 }
 
-struct rpc_call_ops nfs4_get_lease_time_ops = {
+static const struct rpc_call_ops nfs4_get_lease_time_ops = {
 	.rpc_call_prepare = nfs4_get_lease_time_prepare,
 	.rpc_call_done = nfs4_get_lease_time_done,
 };
@@ -4997,6 +5170,7 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
 	};
 	int status;
 
+	nfs41_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0);
 	dprintk("--> %s\n", __func__);
 	task = rpc_run_task(&task_setup);
 
@@ -5113,13 +5287,13 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 		return NULL;
 
 	tbl = &session->fc_slot_table;
-	tbl->highest_used_slotid = -1;
+	tbl->highest_used_slotid = NFS4_NO_SLOT;
 	spin_lock_init(&tbl->slot_tbl_lock);
 	rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
 	init_completion(&tbl->complete);
 
 	tbl = &session->bc_slot_table;
-	tbl->highest_used_slotid = -1;
+	tbl->highest_used_slotid = NFS4_NO_SLOT;
 	spin_lock_init(&tbl->slot_tbl_lock);
 	rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
 	init_completion(&tbl->complete);
@@ -5132,11 +5306,16 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 
 void nfs4_destroy_session(struct nfs4_session *session)
 {
+	struct rpc_xprt *xprt;
+
 	nfs4_proc_destroy_session(session);
+
+	rcu_read_lock();
+	xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt);
+	rcu_read_unlock();
 	dprintk("%s Destroy backchannel for xprt %p\n",
-		__func__, session->clp->cl_rpcclient->cl_xprt);
-	xprt_destroy_backchannel(session->clp->cl_rpcclient->cl_xprt,
-				NFS41_BC_MIN_CALLBACKS);
+		__func__, xprt);
+	xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
 	nfs4_destroy_slot_tables(session);
 	kfree(session);
 }
@@ -5164,7 +5343,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
 	args->fc_attrs.max_rqst_sz = mxrqst_sz;
 	args->fc_attrs.max_resp_sz = mxresp_sz;
 	args->fc_attrs.max_ops = NFS4_MAX_OPS;
-	args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs;
+	args->fc_attrs.max_reqs = max_session_slots;
 
 	dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "
 		"max_ops=%u max_reqs=%u\n",
@@ -5204,6 +5383,8 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args
 		return -EINVAL;
 	if (rcvd->max_reqs == 0)
 		return -EINVAL;
+	if (rcvd->max_reqs > NFS4_MAX_SLOT_TABLE)
+		rcvd->max_reqs = NFS4_MAX_SLOT_TABLE;
 	return 0;
 }
 
@@ -5219,9 +5400,9 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args
 	if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
 		return -EINVAL;
 	/* These would render the backchannel useless: */
-	if (rcvd->max_ops == 0)
+	if (rcvd->max_ops != sent->max_ops)
 		return -EINVAL;
-	if (rcvd->max_reqs == 0)
+	if (rcvd->max_reqs != sent->max_reqs)
 		return -EINVAL;
 	return 0;
 }
@@ -5324,7 +5505,7 @@ int nfs4_proc_destroy_session(struct nfs4_session *session)
 
 	if (status)
 		printk(KERN_WARNING
-			"Got error %d from the server on DESTROY_SESSION. "
+			"NFS: Got error %d from the server on DESTROY_SESSION. "
 			"Session has been destroyed regardless...\n", status);
 
 	dprintk("<-- nfs4_proc_destroy_session\n");
@@ -5447,7 +5628,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
 	args = task->tk_msg.rpc_argp;
 	res = task->tk_msg.rpc_resp;
 
-	if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task))
+	if (nfs41_setup_sequence(clp->cl_session, args, res, task))
 		return;
 	rpc_call_start(task);
 }
@@ -5479,6 +5660,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
 		nfs_put_client(clp);
 		return ERR_PTR(-ENOMEM);
 	}
+	nfs41_init_sequence(&calldata->args, &calldata->res, 0);
 	msg.rpc_argp = &calldata->args;
 	msg.rpc_resp = &calldata->res;
 	calldata->clp = clp;
@@ -5540,7 +5722,7 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
 	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
 	if (nfs41_setup_sequence(calldata->clp->cl_session,
 				&calldata->arg.seq_args,
-				&calldata->res.seq_res, 0, task))
+				&calldata->res.seq_res, task))
 		return;
 
 	rpc_call_start(task);
@@ -5619,6 +5801,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
 	calldata->clp = clp;
 	calldata->arg.one_fs = 0;
 
+	nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0);
 	msg.rpc_argp = &calldata->arg;
 	msg.rpc_resp = &calldata->res;
 	task_setup_data.callback_data = calldata;
@@ -5650,7 +5833,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
 	 * to be no way to prevent it completely.
 	 */
 	if (nfs4_setup_sequence(server, &lgp->args.seq_args,
-				&lgp->res.seq_res, 0, task))
+				&lgp->res.seq_res, task))
 		return;
 	if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
 					  NFS_I(lgp->args.inode)->layout,
@@ -5725,6 +5908,7 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
 
 	lgp->res.layoutp = &lgp->args.layout;
 	lgp->res.seq_res.sr_slot = NULL;
+	nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
@@ -5745,7 +5929,7 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
 
 	dprintk("--> %s\n", __func__);
 	if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
-				&lrp->res.seq_res, 0, task))
+				&lrp->res.seq_res, task))
 		return;
 	rpc_call_start(task);
 }
@@ -5811,6 +5995,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
 	int status;
 
 	dprintk("--> %s\n", __func__);
+	nfs41_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1);
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
@@ -5911,7 +6096,7 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata)
 	struct nfs_server *server = NFS_SERVER(data->args.inode);
 
 	if (nfs4_setup_sequence(server, &data->args.seq_args,
-				&data->res.seq_res, 1, task))
+				&data->res.seq_res, task))
 		return;
 	rpc_call_start(task);
 }
@@ -5998,6 +6183,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
 		data->args.lastbytewritten,
 		data->args.inode->i_ino);
 
+	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
@@ -6091,11 +6277,12 @@ out_freepage:
 out:
 	return err;
 }
-static int _nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state)
+
+static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
 {
 	int status;
 	struct nfs41_test_stateid_args args = {
-		.stateid = &state->stateid,
+		.stateid = stateid,
 	};
 	struct nfs41_test_stateid_res res;
 	struct rpc_message msg = {
@@ -6103,28 +6290,31 @@ static int _nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *sta
 		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
-	args.seq_args.sa_session = res.seq_res.sr_session = NULL;
-	status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1);
+
+	nfs41_init_sequence(&args.seq_args, &res.seq_res, 0);
+	status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
+
+	if (status == NFS_OK)
+		return res.status;
 	return status;
 }
 
-static int nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state)
+static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
 {
 	struct nfs4_exception exception = { };
 	int err;
 	do {
 		err = nfs4_handle_exception(server,
-				_nfs41_test_stateid(server, state),
+				_nfs41_test_stateid(server, stateid),
 				&exception);
 	} while (exception.retry);
 	return err;
 }
 
-static int _nfs4_free_stateid(struct nfs_server *server, struct nfs4_state *state)
+static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
 {
-	int status;
 	struct nfs41_free_stateid_args args = {
-		.stateid = &state->stateid,
+		.stateid = stateid,
 	};
 	struct nfs41_free_stateid_res res;
 	struct rpc_message msg = {
@@ -6133,25 +6323,46 @@ static int _nfs4_free_stateid(struct nfs_server *server, struct nfs4_state *stat
 		.rpc_resp = &res,
 	};
 
-	args.seq_args.sa_session = res.seq_res.sr_session = NULL;
-	status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1);
-	return status;
+	nfs41_init_sequence(&args.seq_args, &res.seq_res, 0);
+	return nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
 }
 
-static int nfs41_free_stateid(struct nfs_server *server, struct nfs4_state *state)
+static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
 {
 	struct nfs4_exception exception = { };
 	int err;
 	do {
 		err = nfs4_handle_exception(server,
-				_nfs4_free_stateid(server, state),
+				_nfs4_free_stateid(server, stateid),
 				&exception);
 	} while (exception.retry);
 	return err;
 }
+
+static bool nfs41_match_stateid(const nfs4_stateid *s1,
+		const nfs4_stateid *s2)
+{
+	if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0)
+		return false;
+
+	if (s1->seqid == s2->seqid)
+		return true;
+	if (s1->seqid == 0 || s2->seqid == 0)
+		return true;
+
+	return false;
+}
+
 #endif /* CONFIG_NFS_V4_1 */
 
-struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
+static bool nfs4_match_stateid(const nfs4_stateid *s1,
+		const nfs4_stateid *s2)
+{
+	return nfs4_stateid_match(s1, s2);
+}
+
+
+static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
 	.owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
 	.state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
 	.recover_open = nfs4_open_reclaim,
@@ -6161,7 +6372,7 @@ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
 };
 
 #if defined(CONFIG_NFS_V4_1)
-struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
+static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
 	.owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
 	.state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
 	.recover_open = nfs4_open_reclaim,
@@ -6172,7 +6383,7 @@ struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
 };
 #endif /* CONFIG_NFS_V4_1 */
 
-struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
+static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
 	.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
 	.state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
 	.recover_open = nfs4_open_expired,
@@ -6182,7 +6393,7 @@ struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
 };
 
 #if defined(CONFIG_NFS_V4_1)
-struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
+static const struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
 	.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
 	.state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
 	.recover_open = nfs41_open_expired,
@@ -6192,14 +6403,14 @@ struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
 };
 #endif /* CONFIG_NFS_V4_1 */
 
-struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = {
+static const struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = {
 	.sched_state_renewal = nfs4_proc_async_renew,
 	.get_state_renewal_cred_locked = nfs4_get_renew_cred_locked,
 	.renew_lease = nfs4_proc_renew,
 };
 
 #if defined(CONFIG_NFS_V4_1)
-struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
+static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 	.sched_state_renewal = nfs41_proc_async_sequence,
 	.get_state_renewal_cred_locked = nfs4_get_machine_cred_locked,
 	.renew_lease = nfs4_proc_sequence,
@@ -6209,7 +6420,7 @@ struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 	.minor_version = 0,
 	.call_sync = _nfs4_call_sync,
-	.validate_stateid = nfs4_validate_delegation_stateid,
+	.match_stateid = nfs4_match_stateid,
 	.find_root_sec = nfs4_find_root_sec,
 	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
@@ -6220,7 +6431,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 	.minor_version = 1,
 	.call_sync = _nfs4_call_sync_session,
-	.validate_stateid = nfs41_validate_delegation_stateid,
+	.match_stateid = nfs41_match_stateid,
 	.find_root_sec = nfs41_find_root_sec,
 	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
@@ -6260,9 +6471,11 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.create = nfs4_proc_create,
 	.remove = nfs4_proc_remove,
 	.unlink_setup = nfs4_proc_unlink_setup,
+	.unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare,
 	.unlink_done = nfs4_proc_unlink_done,
 	.rename = nfs4_proc_rename,
 	.rename_setup = nfs4_proc_rename_setup,
+	.rename_rpc_prepare = nfs4_proc_rename_rpc_prepare,
 	.rename_done = nfs4_proc_rename_done,
 	.link = nfs4_proc_link,
 	.symlink = nfs4_proc_symlink,
@@ -6276,8 +6489,10 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.set_capabilities = nfs4_server_capabilities,
 	.decode_dirent = nfs4_decode_dirent,
 	.read_setup = nfs4_proc_read_setup,
+	.read_rpc_prepare = nfs4_proc_read_rpc_prepare,
 	.read_done = nfs4_read_done,
 	.write_setup = nfs4_proc_write_setup,
+	.write_rpc_prepare = nfs4_proc_write_rpc_prepare,
 	.write_done = nfs4_write_done,
 	.commit_setup = nfs4_proc_commit_setup,
 	.commit_done = nfs4_commit_done,
@@ -6301,6 +6516,10 @@ const struct xattr_handler *nfs4_xattr_handlers[] = {
 	NULL
 };
 
+module_param(max_session_slots, ushort, 0644);
+MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
+		"requests the client will negotiate");
+
 /*
  * Local variables:
  *  c-basic-offset: 8
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 45392032e7bd..0f43414eb25a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -146,6 +146,11 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
 	struct rpc_cred *cred = NULL;
 	struct nfs_server *server;
 
+	/* Use machine credentials if available */
+	cred = nfs4_get_machine_cred_locked(clp);
+	if (cred != NULL)
+		goto out;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
 		cred = nfs4_get_renew_cred_server_locked(server);
@@ -153,6 +158,8 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
 			break;
 	}
 	rcu_read_unlock();
+
+out:
 	return cred;
 }
 
@@ -190,30 +197,29 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
 static void nfs4_end_drain_session(struct nfs_client *clp)
 {
 	struct nfs4_session *ses = clp->cl_session;
+	struct nfs4_slot_table *tbl;
 	int max_slots;
 
 	if (ses == NULL)
 		return;
+	tbl = &ses->fc_slot_table;
 	if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
-		spin_lock(&ses->fc_slot_table.slot_tbl_lock);
-		max_slots = ses->fc_slot_table.max_slots;
+		spin_lock(&tbl->slot_tbl_lock);
+		max_slots = tbl->max_slots;
 		while (max_slots--) {
-			struct rpc_task *task;
-
-			task = rpc_wake_up_next(&ses->fc_slot_table.
-						slot_tbl_waitq);
-			if (!task)
+			if (rpc_wake_up_first(&tbl->slot_tbl_waitq,
+						nfs4_set_task_privileged,
+						NULL) == NULL)
 				break;
-			rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
 		}
-		spin_unlock(&ses->fc_slot_table.slot_tbl_lock);
+		spin_unlock(&tbl->slot_tbl_lock);
 	}
 }
 
 static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)
 {
 	spin_lock(&tbl->slot_tbl_lock);
-	if (tbl->highest_used_slotid != -1) {
+	if (tbl->highest_used_slotid != NFS4_NO_SLOT) {
 		INIT_COMPLETION(tbl->complete);
 		spin_unlock(&tbl->slot_tbl_lock);
 		return wait_for_completion_interruptible(&tbl->complete);
@@ -317,62 +323,6 @@ out:
 	return cred;
 }
 
-static void nfs_alloc_unique_id_locked(struct rb_root *root,
-				       struct nfs_unique_id *new,
-				       __u64 minval, int maxbits)
-{
-	struct rb_node **p, *parent;
-	struct nfs_unique_id *pos;
-	__u64 mask = ~0ULL;
-
-	if (maxbits < 64)
-		mask = (1ULL << maxbits) - 1ULL;
-
-	/* Ensure distribution is more or less flat */
-	get_random_bytes(&new->id, sizeof(new->id));
-	new->id &= mask;
-	if (new->id < minval)
-		new->id += minval;
-retry:
-	p = &root->rb_node;
-	parent = NULL;
-
-	while (*p != NULL) {
-		parent = *p;
-		pos = rb_entry(parent, struct nfs_unique_id, rb_node);
-
-		if (new->id < pos->id)
-			p = &(*p)->rb_left;
-		else if (new->id > pos->id)
-			p = &(*p)->rb_right;
-		else
-			goto id_exists;
-	}
-	rb_link_node(&new->rb_node, parent, p);
-	rb_insert_color(&new->rb_node, root);
-	return;
-id_exists:
-	for (;;) {
-		new->id++;
-		if (new->id < minval || (new->id & mask) != new->id) {
-			new->id = minval;
-			break;
-		}
-		parent = rb_next(parent);
-		if (parent == NULL)
-			break;
-		pos = rb_entry(parent, struct nfs_unique_id, rb_node);
-		if (new->id < pos->id)
-			break;
-	}
-	goto retry;
-}
-
-static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id)
-{
-	rb_erase(&id->rb_node, root);
-}
-
 static struct nfs4_state_owner *
 nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
 {
@@ -405,6 +355,7 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
 	struct rb_node **p = &server->state_owners.rb_node,
 		       *parent = NULL;
 	struct nfs4_state_owner *sp;
+	int err;
 
 	while (*p != NULL) {
 		parent = *p;
@@ -421,8 +372,9 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
 			return sp;
 		}
 	}
-	nfs_alloc_unique_id_locked(&server->openowner_id,
-					&new->so_owner_id, 1, 64);
+	err = ida_get_new(&server->openowner_id, &new->so_seqid.owner_id);
+	if (err)
+		return ERR_PTR(err);
 	rb_link_node(&new->so_server_node, parent, p);
 	rb_insert_color(&new->so_server_node, &server->state_owners);
 	return new;
@@ -435,7 +387,23 @@ nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
 
 	if (!RB_EMPTY_NODE(&sp->so_server_node))
 		rb_erase(&sp->so_server_node, &server->state_owners);
-	nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id);
+	ida_remove(&server->openowner_id, sp->so_seqid.owner_id);
+}
+
+static void
+nfs4_init_seqid_counter(struct nfs_seqid_counter *sc)
+{
+	sc->flags = 0;
+	sc->counter = 0;
+	spin_lock_init(&sc->lock);
+	INIT_LIST_HEAD(&sc->list);
+	rpc_init_wait_queue(&sc->wait, "Seqid_waitqueue");
+}
+
+static void
+nfs4_destroy_seqid_counter(struct nfs_seqid_counter *sc)
+{
+	rpc_destroy_wait_queue(&sc->wait);
 }
 
 /*
@@ -444,19 +412,20 @@ nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
  *
  */
 static struct nfs4_state_owner *
-nfs4_alloc_state_owner(void)
+nfs4_alloc_state_owner(struct nfs_server *server,
+		struct rpc_cred *cred,
+		gfp_t gfp_flags)
 {
 	struct nfs4_state_owner *sp;
 
-	sp = kzalloc(sizeof(*sp),GFP_NOFS);
+	sp = kzalloc(sizeof(*sp), gfp_flags);
 	if (!sp)
 		return NULL;
+	sp->so_server = server;
+	sp->so_cred = get_rpccred(cred);
 	spin_lock_init(&sp->so_lock);
 	INIT_LIST_HEAD(&sp->so_states);
-	rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue");
-	sp->so_seqid.sequence = &sp->so_sequence;
-	spin_lock_init(&sp->so_sequence.lock);
-	INIT_LIST_HEAD(&sp->so_sequence.list);
+	nfs4_init_seqid_counter(&sp->so_seqid);
 	atomic_set(&sp->so_count, 1);
 	INIT_LIST_HEAD(&sp->so_lru);
 	return sp;
@@ -478,7 +447,7 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
 
 static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
 {
-	rpc_destroy_wait_queue(&sp->so_sequence.wait);
+	nfs4_destroy_seqid_counter(&sp->so_seqid);
 	put_rpccred(sp->so_cred);
 	kfree(sp);
 }
@@ -516,7 +485,8 @@ static void nfs4_gc_state_owners(struct nfs_server *server)
  * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
517 */ 486 */
518struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, 487struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
519 struct rpc_cred *cred) 488 struct rpc_cred *cred,
489 gfp_t gfp_flags)
520{ 490{
521 struct nfs_client *clp = server->nfs_client; 491 struct nfs_client *clp = server->nfs_client;
522 struct nfs4_state_owner *sp, *new; 492 struct nfs4_state_owner *sp, *new;
@@ -526,20 +496,18 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
526 spin_unlock(&clp->cl_lock); 496 spin_unlock(&clp->cl_lock);
527 if (sp != NULL) 497 if (sp != NULL)
528 goto out; 498 goto out;
529 new = nfs4_alloc_state_owner(); 499 new = nfs4_alloc_state_owner(server, cred, gfp_flags);
530 if (new == NULL) 500 if (new == NULL)
531 goto out; 501 goto out;
532 new->so_server = server; 502 do {
533 new->so_cred = cred; 503 if (ida_pre_get(&server->openowner_id, gfp_flags) == 0)
534 spin_lock(&clp->cl_lock); 504 break;
535 sp = nfs4_insert_state_owner_locked(new); 505 spin_lock(&clp->cl_lock);
536 spin_unlock(&clp->cl_lock); 506 sp = nfs4_insert_state_owner_locked(new);
537 if (sp == new) 507 spin_unlock(&clp->cl_lock);
538 get_rpccred(cred); 508 } while (sp == ERR_PTR(-EAGAIN));
539 else { 509 if (sp != new)
540 rpc_destroy_wait_queue(&new->so_sequence.wait); 510 nfs4_free_state_owner(new);
541 kfree(new);
542 }
543out: 511out:
544 nfs4_gc_state_owners(server); 512 nfs4_gc_state_owners(server);
545 return sp; 513 return sp;
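The retry loop above follows the two-phase ida protocol of this era:
ida_pre_get() preloads memory outside the spinlock and returns 0 on
allocation failure, while ida_get_new() (called under clp->cl_lock inside
nfs4_insert_state_owner_locked()) returns -EAGAIN when a racing allocator
consumed the preload. A minimal sketch of the pattern, with hypothetical
names:

	static int example_get_id(struct ida *ida, spinlock_t *lock, int *id)
	{
		int err;

		do {
			if (!ida_pre_get(ida, GFP_NOFS))
				return -ENOMEM;		/* preload failed */
			spin_lock(lock);
			err = ida_get_new(ida, id);	/* lowest free id */
			spin_unlock(lock);
		} while (err == -EAGAIN);		/* preload stolen: retry */
		return err;
	}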
@@ -795,15 +763,11 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
795{ 763{
796 struct nfs4_lock_state *lsp; 764 struct nfs4_lock_state *lsp;
797 struct nfs_server *server = state->owner->so_server; 765 struct nfs_server *server = state->owner->so_server;
798 struct nfs_client *clp = server->nfs_client;
799 766
800 lsp = kzalloc(sizeof(*lsp), GFP_NOFS); 767 lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
801 if (lsp == NULL) 768 if (lsp == NULL)
802 return NULL; 769 return NULL;
803 rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue"); 770 nfs4_init_seqid_counter(&lsp->ls_seqid);
804 spin_lock_init(&lsp->ls_sequence.lock);
805 INIT_LIST_HEAD(&lsp->ls_sequence.list);
806 lsp->ls_seqid.sequence = &lsp->ls_sequence;
807 atomic_set(&lsp->ls_count, 1); 771 atomic_set(&lsp->ls_count, 1);
808 lsp->ls_state = state; 772 lsp->ls_state = state;
809 lsp->ls_owner.lo_type = type; 773 lsp->ls_owner.lo_type = type;
@@ -815,25 +779,22 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
815 lsp->ls_owner.lo_u.posix_owner = fl_owner; 779 lsp->ls_owner.lo_u.posix_owner = fl_owner;
816 break; 780 break;
817 default: 781 default:
818 kfree(lsp); 782 goto out_free;
819 return NULL;
820 } 783 }
821 spin_lock(&clp->cl_lock); 784 lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
822 nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64); 785 if (lsp->ls_seqid.owner_id < 0)
823 spin_unlock(&clp->cl_lock); 786 goto out_free;
824 INIT_LIST_HEAD(&lsp->ls_locks); 787 INIT_LIST_HEAD(&lsp->ls_locks);
825 return lsp; 788 return lsp;
789out_free:
790 kfree(lsp);
791 return NULL;
826} 792}
827 793
828static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) 794void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
829{ 795{
830 struct nfs_server *server = lsp->ls_state->owner->so_server; 796 ida_simple_remove(&server->lockowner_id, lsp->ls_seqid.owner_id);
831 struct nfs_client *clp = server->nfs_client; 797 nfs4_destroy_seqid_counter(&lsp->ls_seqid);
832
833 spin_lock(&clp->cl_lock);
834 nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id);
835 spin_unlock(&clp->cl_lock);
836 rpc_destroy_wait_queue(&lsp->ls_sequence.wait);
837 kfree(lsp); 798 kfree(lsp);
838} 799}
839 800
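The lock-owner side, by contrast, switches to the self-locking
ida_simple_*() helpers, which embed the preload-and-retry dance behind an
internal lock and need no external spinlock. A hypothetical round trip:

	static int example_simple_id(struct ida *ida)
	{
		int id;

		/* start 0, end 0 => full range [0, INT_MAX] */
		id = ida_simple_get(ida, 0, 0, GFP_NOFS);
		if (id < 0)
			return id;	/* -ENOMEM or -ENOSPC */
		ida_simple_remove(ida, id);
		return 0;
	}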
@@ -865,7 +826,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
865 } 826 }
866 spin_unlock(&state->state_lock); 827 spin_unlock(&state->state_lock);
867 if (new != NULL) 828 if (new != NULL)
868 nfs4_free_lock_state(new); 829 nfs4_free_lock_state(state->owner->so_server, new);
869 return lsp; 830 return lsp;
870} 831}
871 832
@@ -886,9 +847,11 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
886 if (list_empty(&state->lock_states)) 847 if (list_empty(&state->lock_states))
887 clear_bit(LK_STATE_IN_USE, &state->flags); 848 clear_bit(LK_STATE_IN_USE, &state->flags);
888 spin_unlock(&state->state_lock); 849 spin_unlock(&state->state_lock);
889 if (lsp->ls_flags & NFS_LOCK_INITIALIZED) 850 if (lsp->ls_flags & NFS_LOCK_INITIALIZED) {
890 nfs4_release_lockowner(lsp); 851 if (nfs4_release_lockowner(lsp) == 0)
891 nfs4_free_lock_state(lsp); 852 return;
853 }
854 nfs4_free_lock_state(lsp->ls_state->owner->so_server, lsp);
892} 855}
893 856
894static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) 857static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
@@ -918,7 +881,8 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
918 if (fl->fl_flags & FL_POSIX) 881 if (fl->fl_flags & FL_POSIX)
919 lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE); 882 lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
920 else if (fl->fl_flags & FL_FLOCK) 883 else if (fl->fl_flags & FL_FLOCK)
921 lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE); 884 lsp = nfs4_get_lock_state(state, NULL, fl->fl_pid,
885 NFS4_FLOCK_LOCK_TYPE);
922 else 886 else
923 return -EINVAL; 887 return -EINVAL;
924 if (lsp == NULL) 888 if (lsp == NULL)
@@ -928,28 +892,49 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
928 return 0; 892 return 0;
929} 893}
930 894
931/* 895static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state,
932 * Byte-range lock aware utility to initialize the stateid of read/write 896 fl_owner_t fl_owner, pid_t fl_pid)
933 * requests.
934 */
935void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid)
936{ 897{
937 struct nfs4_lock_state *lsp; 898 struct nfs4_lock_state *lsp;
938 int seq; 899 bool ret = false;
939 900
940 do {
941 seq = read_seqbegin(&state->seqlock);
942 memcpy(dst, &state->stateid, sizeof(*dst));
943 } while (read_seqretry(&state->seqlock, seq));
944 if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) 901 if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
945 return; 902 goto out;
946 903
947 spin_lock(&state->state_lock); 904 spin_lock(&state->state_lock);
948 lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); 905 lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
949 if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) 906 if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) {
950 memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); 907 nfs4_stateid_copy(dst, &lsp->ls_stateid);
908 ret = true;
909 }
951 spin_unlock(&state->state_lock); 910 spin_unlock(&state->state_lock);
952 nfs4_put_lock_state(lsp); 911 nfs4_put_lock_state(lsp);
912out:
913 return ret;
914}
915
916static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
917{
918 int seq;
919
920 do {
921 seq = read_seqbegin(&state->seqlock);
922 nfs4_stateid_copy(dst, &state->stateid);
923 } while (read_seqretry(&state->seqlock, seq));
924}
925
926/*
927 * Byte-range lock aware utility to initialize the stateid of read/write
928 * requests.
929 */
930void nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
931 fmode_t fmode, fl_owner_t fl_owner, pid_t fl_pid)
932{
933 if (nfs4_copy_delegation_stateid(dst, state->inode, fmode))
934 return;
935 if (nfs4_copy_lock_stateid(dst, state, fl_owner, fl_pid))
936 return;
937 nfs4_copy_open_stateid(dst, state);
953} 938}
954 939
955struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask) 940struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask)
@@ -960,20 +945,28 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_m
960 if (new != NULL) { 945 if (new != NULL) {
961 new->sequence = counter; 946 new->sequence = counter;
962 INIT_LIST_HEAD(&new->list); 947 INIT_LIST_HEAD(&new->list);
948 new->task = NULL;
963 } 949 }
964 return new; 950 return new;
965} 951}
966 952
967void nfs_release_seqid(struct nfs_seqid *seqid) 953void nfs_release_seqid(struct nfs_seqid *seqid)
968{ 954{
969 if (!list_empty(&seqid->list)) { 955 struct nfs_seqid_counter *sequence;
970 struct rpc_sequence *sequence = seqid->sequence->sequence;
971 956
972 spin_lock(&sequence->lock); 957 if (list_empty(&seqid->list))
973 list_del_init(&seqid->list); 958 return;
974 spin_unlock(&sequence->lock); 959 sequence = seqid->sequence;
975 rpc_wake_up(&sequence->wait); 960 spin_lock(&sequence->lock);
961 list_del_init(&seqid->list);
962 if (!list_empty(&sequence->list)) {
963 struct nfs_seqid *next;
964
965 next = list_first_entry(&sequence->list,
966 struct nfs_seqid, list);
967 rpc_wake_up_queued_task(&sequence->wait, next->task);
976 } 968 }
969 spin_unlock(&sequence->lock);
977} 970}
978 971
979void nfs_free_seqid(struct nfs_seqid *seqid) 972void nfs_free_seqid(struct nfs_seqid *seqid)
@@ -989,14 +982,14 @@ void nfs_free_seqid(struct nfs_seqid *seqid)
989 */ 982 */
990static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) 983static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
991{ 984{
992 BUG_ON(list_first_entry(&seqid->sequence->sequence->list, struct nfs_seqid, list) != seqid); 985 BUG_ON(list_first_entry(&seqid->sequence->list, struct nfs_seqid, list) != seqid);
993 switch (status) { 986 switch (status) {
994 case 0: 987 case 0:
995 break; 988 break;
996 case -NFS4ERR_BAD_SEQID: 989 case -NFS4ERR_BAD_SEQID:
997 if (seqid->sequence->flags & NFS_SEQID_CONFIRMED) 990 if (seqid->sequence->flags & NFS_SEQID_CONFIRMED)
998 return; 991 return;
999 printk(KERN_WARNING "NFS: v4 server returned a bad" 992 pr_warn_ratelimited("NFS: v4 server returned a bad"
1000 " sequence-id error on an" 993 " sequence-id error on an"
1001 " unconfirmed sequence %p!\n", 994 " unconfirmed sequence %p!\n",
1002 seqid->sequence); 995 seqid->sequence);
@@ -1040,10 +1033,11 @@ void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
1040 1033
1041int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) 1034int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
1042{ 1035{
1043 struct rpc_sequence *sequence = seqid->sequence->sequence; 1036 struct nfs_seqid_counter *sequence = seqid->sequence;
1044 int status = 0; 1037 int status = 0;
1045 1038
1046 spin_lock(&sequence->lock); 1039 spin_lock(&sequence->lock);
1040 seqid->task = task;
1047 if (list_empty(&seqid->list)) 1041 if (list_empty(&seqid->list))
1048 list_add_tail(&seqid->list, &sequence->list); 1042 list_add_tail(&seqid->list, &sequence->list);
1049 if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid) 1043 if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid)
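Taken together, the two seqid hunks above replace a wake-everyone scheme
with a wake-one handoff; a sketch of the resulting protocol:

	/*
	 * nfs_wait_on_sequence(): record seqid->task and queue the seqid
	 *     on sequence->list; only the list head proceeds, the rest
	 *     sleep on sequence->wait.
	 * nfs_release_seqid(): dequeue the finished seqid, then wake only
	 *     the task belonging to the new list head via
	 *     rpc_wake_up_queued_task().
	 *
	 * The old rpc_wake_up() woke every waiter, only for all but the
	 * new head to requeue themselves.
	 */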
@@ -1072,19 +1066,28 @@ static void nfs4_clear_state_manager_bit(struct nfs_client *clp)
1072void nfs4_schedule_state_manager(struct nfs_client *clp) 1066void nfs4_schedule_state_manager(struct nfs_client *clp)
1073{ 1067{
1074 struct task_struct *task; 1068 struct task_struct *task;
1069 char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];
1075 1070
1076 if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) 1071 if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
1077 return; 1072 return;
1078 __module_get(THIS_MODULE); 1073 __module_get(THIS_MODULE);
1079 atomic_inc(&clp->cl_count); 1074 atomic_inc(&clp->cl_count);
1080 task = kthread_run(nfs4_run_state_manager, clp, "%s-manager", 1075
1081 rpc_peeraddr2str(clp->cl_rpcclient, 1076 /* The rcu_read_lock() is not strictly necessary, as the state
1082 RPC_DISPLAY_ADDR)); 1077 * manager is the only thread that ever changes the rpc_xprt
1083 if (!IS_ERR(task)) 1078 * after it's initialized. At this point, we're single threaded. */
1084 return; 1079 rcu_read_lock();
1085 nfs4_clear_state_manager_bit(clp); 1080 snprintf(buf, sizeof(buf), "%s-manager",
1086 nfs_put_client(clp); 1081 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
1087 module_put(THIS_MODULE); 1082 rcu_read_unlock();
1083 task = kthread_run(nfs4_run_state_manager, clp, buf);
1084 if (IS_ERR(task)) {
1085 printk(KERN_ERR "%s: kthread_run: %ld\n",
1086 __func__, PTR_ERR(task));
1087 nfs4_clear_state_manager_bit(clp);
1088 nfs_put_client(clp);
1089 module_put(THIS_MODULE);
1090 }
1088} 1091}
1089 1092
1090/* 1093/*
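The name buffer above is sized so the snprintf() can never truncate:

	/* INET6_ADDRSTRLEN (48 in the kernel) bounds the printable peer
	 * address, sizeof("-manager") (9) includes the NUL, +1 is slack. */
	char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];

Formatting into a local buffer also keeps the rpc_peeraddr2str() call
inside its own rcu_read_lock() section instead of inside kthread_run()'s
varargs expansion.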
@@ -1098,10 +1101,25 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
1098 set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); 1101 set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
1099 nfs4_schedule_state_manager(clp); 1102 nfs4_schedule_state_manager(clp);
1100} 1103}
1104EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery);
1105
1106/*
1107 * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
1108 * @clp: client to process
1109 *
1110 * Set the NFS4CLNT_LEASE_EXPIRED state in order to force a
1111 * resend of the SETCLIENTID and hence re-establish the
1112 * callback channel. Then return all existing delegations.
1113 */
1114static void nfs40_handle_cb_pathdown(struct nfs_client *clp)
1115{
1116 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1117 nfs_expire_all_delegations(clp);
1118}
1101 1119
1102void nfs4_schedule_path_down_recovery(struct nfs_client *clp) 1120void nfs4_schedule_path_down_recovery(struct nfs_client *clp)
1103{ 1121{
1104 nfs_handle_cb_pathdown(clp); 1122 nfs40_handle_cb_pathdown(clp);
1105 nfs4_schedule_state_manager(clp); 1123 nfs4_schedule_state_manager(clp);
1106} 1124}
1107 1125
@@ -1132,11 +1150,37 @@ void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4
1132{ 1150{
1133 struct nfs_client *clp = server->nfs_client; 1151 struct nfs_client *clp = server->nfs_client;
1134 1152
1135 if (test_and_clear_bit(NFS_DELEGATED_STATE, &state->flags))
1136 nfs_async_inode_return_delegation(state->inode, &state->stateid);
1137 nfs4_state_mark_reclaim_nograce(clp, state); 1153 nfs4_state_mark_reclaim_nograce(clp, state);
1138 nfs4_schedule_state_manager(clp); 1154 nfs4_schedule_state_manager(clp);
1139} 1155}
1156EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery);
1157
1158void nfs_inode_find_state_and_recover(struct inode *inode,
1159 const nfs4_stateid *stateid)
1160{
1161 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
1162 struct nfs_inode *nfsi = NFS_I(inode);
1163 struct nfs_open_context *ctx;
1164 struct nfs4_state *state;
1165 bool found = false;
1166
1167 spin_lock(&inode->i_lock);
1168 list_for_each_entry(ctx, &nfsi->open_files, list) {
1169 state = ctx->state;
1170 if (state == NULL)
1171 continue;
1172 if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
1173 continue;
1174 if (!nfs4_stateid_match(&state->stateid, stateid))
1175 continue;
1176 nfs4_state_mark_reclaim_nograce(clp, state);
1177 found = true;
1178 }
1179 spin_unlock(&inode->i_lock);
1180 if (found)
1181 nfs4_schedule_state_manager(clp);
1182}
1183
1140 1184
1141static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) 1185static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
1142{ 1186{
@@ -1175,8 +1219,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
1175 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 1219 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
1176 goto out; 1220 goto out;
1177 default: 1221 default:
1178 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", 1222 printk(KERN_ERR "NFS: %s: unhandled error %d. "
1179 __func__, status); 1223 "Zeroing state\n", __func__, status);
1180 case -ENOMEM: 1224 case -ENOMEM:
1181 case -NFS4ERR_DENIED: 1225 case -NFS4ERR_DENIED:
1182 case -NFS4ERR_RECLAIM_BAD: 1226 case -NFS4ERR_RECLAIM_BAD:
@@ -1222,8 +1266,9 @@ restart:
1222 spin_lock(&state->state_lock); 1266 spin_lock(&state->state_lock);
1223 list_for_each_entry(lock, &state->lock_states, ls_locks) { 1267 list_for_each_entry(lock, &state->lock_states, ls_locks) {
1224 if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) 1268 if (!(lock->ls_flags & NFS_LOCK_INITIALIZED))
1225 printk("%s: Lock reclaim failed!\n", 1269 pr_warn_ratelimited("NFS: "
1226 __func__); 1270 "%s: Lock reclaim "
1271 "failed!\n", __func__);
1227 } 1272 }
1228 spin_unlock(&state->state_lock); 1273 spin_unlock(&state->state_lock);
1229 nfs4_put_open_state(state); 1274 nfs4_put_open_state(state);
@@ -1232,8 +1277,8 @@ restart:
1232 } 1277 }
1233 switch (status) { 1278 switch (status) {
1234 default: 1279 default:
1235 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", 1280 printk(KERN_ERR "NFS: %s: unhandled error %d. "
1236 __func__, status); 1281 "Zeroing state\n", __func__, status);
1237 case -ENOENT: 1282 case -ENOENT:
1238 case -ENOMEM: 1283 case -ENOMEM:
1239 case -ESTALE: 1284 case -ESTALE:
@@ -1241,8 +1286,8 @@ restart:
1241 * Open state on this file cannot be recovered 1286 * Open state on this file cannot be recovered
1242 * All we can do is revert to using the zero stateid. 1287 * All we can do is revert to using the zero stateid.
1243 */ 1288 */
1244 memset(state->stateid.data, 0, 1289 memset(&state->stateid, 0,
1245 sizeof(state->stateid.data)); 1290 sizeof(state->stateid));
1246 /* Mark the file as being 'closed' */ 1291 /* Mark the file as being 'closed' */
1247 state->state = 0; 1292 state->state = 0;
1248 break; 1293 break;
@@ -1420,7 +1465,7 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1420 case 0: 1465 case 0:
1421 break; 1466 break;
1422 case -NFS4ERR_CB_PATH_DOWN: 1467 case -NFS4ERR_CB_PATH_DOWN:
1423 nfs_handle_cb_pathdown(clp); 1468 nfs40_handle_cb_pathdown(clp);
1424 break; 1469 break;
1425 case -NFS4ERR_NO_GRACE: 1470 case -NFS4ERR_NO_GRACE:
1426 nfs4_state_end_reclaim_reboot(clp); 1471 nfs4_state_end_reclaim_reboot(clp);
@@ -1801,7 +1846,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
1801 } while (atomic_read(&clp->cl_count) > 1); 1846 } while (atomic_read(&clp->cl_count) > 1);
1802 return; 1847 return;
1803out_error: 1848out_error:
1804 printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s" 1849 pr_warn_ratelimited("NFS: state manager failed on NFSv4 server %s"
1805 " with error %d\n", clp->cl_hostname, -status); 1850 " with error %d\n", clp->cl_hostname, -status);
1806 nfs4_end_drain_session(clp); 1851 nfs4_end_drain_session(clp);
1807 nfs4_clear_state_manager_bit(clp); 1852 nfs4_clear_state_manager_bit(clp);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 33bd8d0f745d..c74fdb114b48 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -44,6 +44,8 @@
44#include <linux/pagemap.h> 44#include <linux/pagemap.h>
45#include <linux/proc_fs.h> 45#include <linux/proc_fs.h>
46#include <linux/kdev_t.h> 46#include <linux/kdev_t.h>
47#include <linux/module.h>
48#include <linux/utsname.h>
47#include <linux/sunrpc/clnt.h> 49#include <linux/sunrpc/clnt.h>
48#include <linux/sunrpc/msg_prot.h> 50#include <linux/sunrpc/msg_prot.h>
49#include <linux/sunrpc/gss_api.h> 51#include <linux/sunrpc/gss_api.h>
@@ -271,7 +273,12 @@ static int nfs4_stat_to_errno(int);
271 1 /* flags */ + \ 273 1 /* flags */ + \
272 1 /* spa_how */ + \ 274 1 /* spa_how */ + \
273 0 /* SP4_NONE (for now) */ + \ 275 0 /* SP4_NONE (for now) */ + \
274 1 /* zero implemetation id array */) 276 1 /* implementation id array of size 1 */ + \
277 1 /* nii_domain */ + \
278 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
279 1 /* nii_name */ + \
280 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
281 3 /* nii_date */)
275#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \ 282#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \
276 2 /* eir_clientid */ + \ 283 2 /* eir_clientid */ + \
277 1 /* eir_sequenceid */ + \ 284 1 /* eir_sequenceid */ + \
@@ -284,7 +291,11 @@ static int nfs4_stat_to_errno(int);
284 /* eir_server_scope<> */ \ 291 /* eir_server_scope<> */ \
285 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \ 292 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
286 1 /* eir_server_impl_id array length */ + \ 293 1 /* eir_server_impl_id array length */ + \
287 0 /* ignored eir_server_impl_id contents */) 294 1 /* nii_domain */ + \
295 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
296 1 /* nii_name */ + \
297 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
298 3 /* nii_date */)
288#define encode_channel_attrs_maxsz (6 + 1 /* ca_rdma_ird.len (0) */) 299#define encode_channel_attrs_maxsz (6 + 1 /* ca_rdma_ird.len (0) */)
289#define decode_channel_attrs_maxsz (6 + \ 300#define decode_channel_attrs_maxsz (6 + \
290 1 /* ca_rdma_ird.len */ + \ 301 1 /* ca_rdma_ird.len */ + \
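A quick check of the size arithmetic in the two maxsz updates above (XDR
sizes count 4-byte words, and XDR_QUADLEN rounds a byte length up to
words):

	/* from include/linux/sunrpc/xdr.h */
	#define XDR_QUADLEN(l)	(((l) + 3) >> 2)

With NFS4_OPAQUE_LIMIT = 1024, XDR_QUADLEN(NFS4_OPAQUE_LIMIT) = 256, so a
single nfs_impl_id4 entry adds 1 + 256 (nii_domain) + 1 + 256 (nii_name)
+ 3 (nii_date, an nfstime4) = 517 words beyond the array-length word.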
@@ -838,6 +849,12 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
838 XDR_UNIT); 849 XDR_UNIT);
839#endif /* CONFIG_NFS_V4_1 */ 850#endif /* CONFIG_NFS_V4_1 */
840 851
852static unsigned short send_implementation_id = 1;
853
854module_param(send_implementation_id, ushort, 0644);
855MODULE_PARM_DESC(send_implementation_id,
856 "Send implementation ID with NFSv4.1 exchange_id");
857
841static const umode_t nfs_type2fmt[] = { 858static const umode_t nfs_type2fmt[] = {
842 [NF4BAD] = 0, 859 [NF4BAD] = 0,
843 [NF4REG] = S_IFREG, 860 [NF4REG] = S_IFREG,
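The new module parameter can be toggled at load time or at runtime; a
usage sketch:

	/* Assuming nfs4xdr.c is still linked into nfs.ko, so the sysfs
	 * path follows that module name:
	 *
	 *   modprobe nfs send_implementation_id=0
	 *   echo 0 > /sys/module/nfs/parameters/send_implementation_id
	 *
	 * Mode 0644 makes the parameter world-readable and root-writable
	 * at runtime. */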
@@ -868,15 +885,44 @@ static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes)
868 return p; 885 return p;
869} 886}
870 887
888static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len)
889{
890 __be32 *p;
891
892 p = xdr_reserve_space(xdr, len);
893 xdr_encode_opaque_fixed(p, buf, len);
894}
895
871static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) 896static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
872{ 897{
873 __be32 *p; 898 __be32 *p;
874 899
875 p = xdr_reserve_space(xdr, 4 + len); 900 p = reserve_space(xdr, 4 + len);
876 BUG_ON(p == NULL);
877 xdr_encode_opaque(p, str, len); 901 xdr_encode_opaque(p, str, len);
878} 902}
879 903
904static void encode_uint32(struct xdr_stream *xdr, u32 n)
905{
906 __be32 *p;
907
908 p = reserve_space(xdr, 4);
909 *p = cpu_to_be32(n);
910}
911
912static void encode_uint64(struct xdr_stream *xdr, u64 n)
913{
914 __be32 *p;
915
916 p = reserve_space(xdr, 8);
917 xdr_encode_hyper(p, n);
918}
919
920static void encode_nfs4_seqid(struct xdr_stream *xdr,
921 const struct nfs_seqid *seqid)
922{
923 encode_uint32(xdr, seqid->sequence->counter);
924}
925
880static void encode_compound_hdr(struct xdr_stream *xdr, 926static void encode_compound_hdr(struct xdr_stream *xdr,
881 struct rpc_rqst *req, 927 struct rpc_rqst *req,
882 struct compound_hdr *hdr) 928 struct compound_hdr *hdr)
@@ -889,28 +935,37 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
889 * but this is not required as a MUST for the server to do so. */ 935 * but this is not required as a MUST for the server to do so. */
890 hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; 936 hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen;
891 937
892 dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
893 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); 938 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
894 p = reserve_space(xdr, 4 + hdr->taglen + 8); 939 encode_string(xdr, hdr->taglen, hdr->tag);
895 p = xdr_encode_opaque(p, hdr->tag, hdr->taglen); 940 p = reserve_space(xdr, 8);
896 *p++ = cpu_to_be32(hdr->minorversion); 941 *p++ = cpu_to_be32(hdr->minorversion);
897 hdr->nops_p = p; 942 hdr->nops_p = p;
898 *p = cpu_to_be32(hdr->nops); 943 *p = cpu_to_be32(hdr->nops);
899} 944}
900 945
946static void encode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 op,
947 uint32_t replen,
948 struct compound_hdr *hdr)
949{
950 encode_uint32(xdr, op);
951 hdr->nops++;
952 hdr->replen += replen;
953}
954
901static void encode_nops(struct compound_hdr *hdr) 955static void encode_nops(struct compound_hdr *hdr)
902{ 956{
903 BUG_ON(hdr->nops > NFS4_MAX_OPS); 957 BUG_ON(hdr->nops > NFS4_MAX_OPS);
904 *hdr->nops_p = htonl(hdr->nops); 958 *hdr->nops_p = htonl(hdr->nops);
905} 959}
906 960
907static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) 961static void encode_nfs4_stateid(struct xdr_stream *xdr, const nfs4_stateid *stateid)
908{ 962{
909 __be32 *p; 963 encode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
964}
910 965
911 p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); 966static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
912 BUG_ON(p == NULL); 967{
913 xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE); 968 encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);
914} 969}
915 970
916static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) 971static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
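With encode_op_hdr() accounting for nops and replen, most encoders reduce
to a declarative sequence of helper calls. A hypothetical operation
(OP_EXAMPLE and decode_example_maxsz are stand-ins, not part of the
patch):

	static void encode_example(struct xdr_stream *xdr, u32 flags,
				   u64 cookie, const nfs4_stateid *stateid,
				   struct compound_hdr *hdr)
	{
		encode_op_hdr(xdr, OP_EXAMPLE, decode_example_maxsz, hdr);
		encode_uint32(xdr, flags);		/* 1 XDR word  */
		encode_uint64(xdr, cookie);		/* 2 XDR words */
		encode_nfs4_stateid(xdr, stateid);	/* 16 bytes    */
	}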
@@ -1023,7 +1078,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
1023 * Now we backfill the bitmap and the attribute buffer length. 1078 * Now we backfill the bitmap and the attribute buffer length.
1024 */ 1079 */
1025 if (len != ((char *)p - (char *)q) + 4) { 1080 if (len != ((char *)p - (char *)q) + 4) {
1026 printk(KERN_ERR "nfs: Attr length error, %u != %Zu\n", 1081 printk(KERN_ERR "NFS: Attr length error, %u != %Zu\n",
1027 len, ((char *)p - (char *)q) + 4); 1082 len, ((char *)p - (char *)q) + 4);
1028 BUG(); 1083 BUG();
1029 } 1084 }
@@ -1037,46 +1092,33 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
1037 1092
1038static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr) 1093static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr)
1039{ 1094{
1040 __be32 *p; 1095 encode_op_hdr(xdr, OP_ACCESS, decode_access_maxsz, hdr);
1041 1096 encode_uint32(xdr, access);
1042 p = reserve_space(xdr, 8);
1043 *p++ = cpu_to_be32(OP_ACCESS);
1044 *p = cpu_to_be32(access);
1045 hdr->nops++;
1046 hdr->replen += decode_access_maxsz;
1047} 1097}
1048 1098
1049static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) 1099static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
1050{ 1100{
1051 __be32 *p; 1101 encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr);
1052 1102 encode_nfs4_seqid(xdr, arg->seqid);
1053 p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); 1103 encode_nfs4_stateid(xdr, arg->stateid);
1054 *p++ = cpu_to_be32(OP_CLOSE);
1055 *p++ = cpu_to_be32(arg->seqid->sequence->counter);
1056 xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
1057 hdr->nops++;
1058 hdr->replen += decode_close_maxsz;
1059} 1104}
1060 1105
1061static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) 1106static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
1062{ 1107{
1063 __be32 *p; 1108 __be32 *p;
1064 1109
1065 p = reserve_space(xdr, 16); 1110 encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr);
1066 *p++ = cpu_to_be32(OP_COMMIT); 1111 p = reserve_space(xdr, 12);
1067 p = xdr_encode_hyper(p, args->offset); 1112 p = xdr_encode_hyper(p, args->offset);
1068 *p = cpu_to_be32(args->count); 1113 *p = cpu_to_be32(args->count);
1069 hdr->nops++;
1070 hdr->replen += decode_commit_maxsz;
1071} 1114}
1072 1115
1073static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr) 1116static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr)
1074{ 1117{
1075 __be32 *p; 1118 __be32 *p;
1076 1119
1077 p = reserve_space(xdr, 8); 1120 encode_op_hdr(xdr, OP_CREATE, decode_create_maxsz, hdr);
1078 *p++ = cpu_to_be32(OP_CREATE); 1121 encode_uint32(xdr, create->ftype);
1079 *p = cpu_to_be32(create->ftype);
1080 1122
1081 switch (create->ftype) { 1123 switch (create->ftype) {
1082 case NF4LNK: 1124 case NF4LNK:
@@ -1096,9 +1138,6 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
1096 } 1138 }
1097 1139
1098 encode_string(xdr, create->name->len, create->name->name); 1140 encode_string(xdr, create->name->len, create->name->name);
1099 hdr->nops++;
1100 hdr->replen += decode_create_maxsz;
1101
1102 encode_attrs(xdr, create->attrs, create->server); 1141 encode_attrs(xdr, create->attrs, create->server);
1103} 1142}
1104 1143
@@ -1106,25 +1145,21 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
1106{ 1145{
1107 __be32 *p; 1146 __be32 *p;
1108 1147
1109 p = reserve_space(xdr, 12); 1148 encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);
1110 *p++ = cpu_to_be32(OP_GETATTR); 1149 p = reserve_space(xdr, 8);
1111 *p++ = cpu_to_be32(1); 1150 *p++ = cpu_to_be32(1);
1112 *p = cpu_to_be32(bitmap); 1151 *p = cpu_to_be32(bitmap);
1113 hdr->nops++;
1114 hdr->replen += decode_getattr_maxsz;
1115} 1152}
1116 1153
1117static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr) 1154static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr)
1118{ 1155{
1119 __be32 *p; 1156 __be32 *p;
1120 1157
1121 p = reserve_space(xdr, 16); 1158 encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);
1122 *p++ = cpu_to_be32(OP_GETATTR); 1159 p = reserve_space(xdr, 12);
1123 *p++ = cpu_to_be32(2); 1160 *p++ = cpu_to_be32(2);
1124 *p++ = cpu_to_be32(bm0); 1161 *p++ = cpu_to_be32(bm0);
1125 *p = cpu_to_be32(bm1); 1162 *p = cpu_to_be32(bm1);
1126 hdr->nops++;
1127 hdr->replen += decode_getattr_maxsz;
1128} 1163}
1129 1164
1130static void 1165static void
@@ -1134,8 +1169,7 @@ encode_getattr_three(struct xdr_stream *xdr,
1134{ 1169{
1135 __be32 *p; 1170 __be32 *p;
1136 1171
1137 p = reserve_space(xdr, 4); 1172 encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);
1138 *p = cpu_to_be32(OP_GETATTR);
1139 if (bm2) { 1173 if (bm2) {
1140 p = reserve_space(xdr, 16); 1174 p = reserve_space(xdr, 16);
1141 *p++ = cpu_to_be32(3); 1175 *p++ = cpu_to_be32(3);
@@ -1152,8 +1186,6 @@ encode_getattr_three(struct xdr_stream *xdr,
1152 *p++ = cpu_to_be32(1); 1186 *p++ = cpu_to_be32(1);
1153 *p = cpu_to_be32(bm0); 1187 *p = cpu_to_be32(bm0);
1154 } 1188 }
1155 hdr->nops++;
1156 hdr->replen += decode_getattr_maxsz;
1157} 1189}
1158 1190
1159static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1191static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
@@ -1179,23 +1211,13 @@ static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, stru
1179 1211
1180static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) 1212static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1181{ 1213{
1182 __be32 *p; 1214 encode_op_hdr(xdr, OP_GETFH, decode_getfh_maxsz, hdr);
1183
1184 p = reserve_space(xdr, 4);
1185 *p = cpu_to_be32(OP_GETFH);
1186 hdr->nops++;
1187 hdr->replen += decode_getfh_maxsz;
1188} 1215}
1189 1216
1190static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) 1217static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1191{ 1218{
1192 __be32 *p; 1219 encode_op_hdr(xdr, OP_LINK, decode_link_maxsz, hdr);
1193 1220 encode_string(xdr, name->len, name->name);
1194 p = reserve_space(xdr, 8 + name->len);
1195 *p++ = cpu_to_be32(OP_LINK);
1196 xdr_encode_opaque(p, name->name, name->len);
1197 hdr->nops++;
1198 hdr->replen += decode_link_maxsz;
1199} 1221}
1200 1222
1201static inline int nfs4_lock_type(struct file_lock *fl, int block) 1223static inline int nfs4_lock_type(struct file_lock *fl, int block)
@@ -1232,79 +1254,60 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
1232{ 1254{
1233 __be32 *p; 1255 __be32 *p;
1234 1256
1235 p = reserve_space(xdr, 32); 1257 encode_op_hdr(xdr, OP_LOCK, decode_lock_maxsz, hdr);
1236 *p++ = cpu_to_be32(OP_LOCK); 1258 p = reserve_space(xdr, 28);
1237 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block)); 1259 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
1238 *p++ = cpu_to_be32(args->reclaim); 1260 *p++ = cpu_to_be32(args->reclaim);
1239 p = xdr_encode_hyper(p, args->fl->fl_start); 1261 p = xdr_encode_hyper(p, args->fl->fl_start);
1240 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); 1262 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1241 *p = cpu_to_be32(args->new_lock_owner); 1263 *p = cpu_to_be32(args->new_lock_owner);
1242 if (args->new_lock_owner){ 1264 if (args->new_lock_owner){
1243 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); 1265 encode_nfs4_seqid(xdr, args->open_seqid);
1244 *p++ = cpu_to_be32(args->open_seqid->sequence->counter); 1266 encode_nfs4_stateid(xdr, args->open_stateid);
1245 p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); 1267 encode_nfs4_seqid(xdr, args->lock_seqid);
1246 *p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
1247 encode_lockowner(xdr, &args->lock_owner); 1268 encode_lockowner(xdr, &args->lock_owner);
1248 } 1269 }
1249 else { 1270 else {
1250 p = reserve_space(xdr, NFS4_STATEID_SIZE+4); 1271 encode_nfs4_stateid(xdr, args->lock_stateid);
1251 p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); 1272 encode_nfs4_seqid(xdr, args->lock_seqid);
1252 *p = cpu_to_be32(args->lock_seqid->sequence->counter);
1253 } 1273 }
1254 hdr->nops++;
1255 hdr->replen += decode_lock_maxsz;
1256} 1274}
1257 1275
1258static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr) 1276static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr)
1259{ 1277{
1260 __be32 *p; 1278 __be32 *p;
1261 1279
1262 p = reserve_space(xdr, 24); 1280 encode_op_hdr(xdr, OP_LOCKT, decode_lockt_maxsz, hdr);
1263 *p++ = cpu_to_be32(OP_LOCKT); 1281 p = reserve_space(xdr, 20);
1264 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); 1282 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
1265 p = xdr_encode_hyper(p, args->fl->fl_start); 1283 p = xdr_encode_hyper(p, args->fl->fl_start);
1266 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); 1284 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1267 encode_lockowner(xdr, &args->lock_owner); 1285 encode_lockowner(xdr, &args->lock_owner);
1268 hdr->nops++;
1269 hdr->replen += decode_lockt_maxsz;
1270} 1286}
1271 1287
1272static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr) 1288static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr)
1273{ 1289{
1274 __be32 *p; 1290 __be32 *p;
1275 1291
1276 p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16); 1292 encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr);
1277 *p++ = cpu_to_be32(OP_LOCKU); 1293 encode_uint32(xdr, nfs4_lock_type(args->fl, 0));
1278 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); 1294 encode_nfs4_seqid(xdr, args->seqid);
1279 *p++ = cpu_to_be32(args->seqid->sequence->counter); 1295 encode_nfs4_stateid(xdr, args->stateid);
1280 p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); 1296 p = reserve_space(xdr, 16);
1281 p = xdr_encode_hyper(p, args->fl->fl_start); 1297 p = xdr_encode_hyper(p, args->fl->fl_start);
1282 xdr_encode_hyper(p, nfs4_lock_length(args->fl)); 1298 xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1283 hdr->nops++;
1284 hdr->replen += decode_locku_maxsz;
1285} 1299}
1286 1300
1287static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr) 1301static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr)
1288{ 1302{
1289 __be32 *p; 1303 encode_op_hdr(xdr, OP_RELEASE_LOCKOWNER, decode_release_lockowner_maxsz, hdr);
1290
1291 p = reserve_space(xdr, 4);
1292 *p = cpu_to_be32(OP_RELEASE_LOCKOWNER);
1293 encode_lockowner(xdr, lowner); 1304 encode_lockowner(xdr, lowner);
1294 hdr->nops++;
1295 hdr->replen += decode_release_lockowner_maxsz;
1296} 1305}
1297 1306
1298static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) 1307static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1299{ 1308{
1300 int len = name->len; 1309 encode_op_hdr(xdr, OP_LOOKUP, decode_lookup_maxsz, hdr);
1301 __be32 *p; 1310 encode_string(xdr, name->len, name->name);
1302
1303 p = reserve_space(xdr, 8 + len);
1304 *p++ = cpu_to_be32(OP_LOOKUP);
1305 xdr_encode_opaque(p, name->name, len);
1306 hdr->nops++;
1307 hdr->replen += decode_lookup_maxsz;
1308} 1311}
1309 1312
1310static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) 1313static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
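The conversions above are size-neutral; as a check:

	/* Wire-size check for the lock encoders above:
	 *   LOCK : old reserve_space(32) = op 4 + type 4 + reclaim 4
	 *          + offset 8 + length 8 + new_lock_owner 4;
	 *          new = encode_op_hdr() 4 + reserve_space(28).
	 *   LOCKT: old reserve_space(24) = op 4 + type 4 + offset 8
	 *          + length 8;
	 *          new = encode_op_hdr() 4 + reserve_space(20).
	 * The bytes on the wire are unchanged; only the opcode emission
	 * moved into the shared helper. */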
@@ -1335,9 +1338,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
1335 * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, 1338 * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
1336 * owner 4 = 32 1339 * owner 4 = 32
1337 */ 1340 */
1338 p = reserve_space(xdr, 8); 1341 encode_nfs4_seqid(xdr, arg->seqid);
1339 *p++ = cpu_to_be32(OP_OPEN);
1340 *p = cpu_to_be32(arg->seqid->sequence->counter);
1341 encode_share_access(xdr, arg->fmode); 1342 encode_share_access(xdr, arg->fmode);
1342 p = reserve_space(xdr, 32); 1343 p = reserve_space(xdr, 32);
1343 p = xdr_encode_hyper(p, arg->clientid); 1344 p = xdr_encode_hyper(p, arg->clientid);
@@ -1437,14 +1438,15 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
1437{ 1438{
1438 __be32 *p; 1439 __be32 *p;
1439 1440
1440 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); 1441 p = reserve_space(xdr, 4);
1441 *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); 1442 *p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
1442 xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); 1443 encode_nfs4_stateid(xdr, stateid);
1443 encode_string(xdr, name->len, name->name); 1444 encode_string(xdr, name->len, name->name);
1444} 1445}
1445 1446
1446static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr) 1447static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr)
1447{ 1448{
1449 encode_op_hdr(xdr, OP_OPEN, decode_open_maxsz, hdr);
1448 encode_openhdr(xdr, arg); 1450 encode_openhdr(xdr, arg);
1449 encode_opentype(xdr, arg); 1451 encode_opentype(xdr, arg);
1450 switch (arg->claim) { 1452 switch (arg->claim) {
@@ -1460,88 +1462,64 @@ static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg,
1460 default: 1462 default:
1461 BUG(); 1463 BUG();
1462 } 1464 }
1463 hdr->nops++;
1464 hdr->replen += decode_open_maxsz;
1465} 1465}
1466 1466
1467static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr) 1467static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr)
1468{ 1468{
1469 __be32 *p; 1469 encode_op_hdr(xdr, OP_OPEN_CONFIRM, decode_open_confirm_maxsz, hdr);
1470 1470 encode_nfs4_stateid(xdr, arg->stateid);
1471 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); 1471 encode_nfs4_seqid(xdr, arg->seqid);
1472 *p++ = cpu_to_be32(OP_OPEN_CONFIRM);
1473 p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
1474 *p = cpu_to_be32(arg->seqid->sequence->counter);
1475 hdr->nops++;
1476 hdr->replen += decode_open_confirm_maxsz;
1477} 1472}
1478 1473
1479static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) 1474static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
1480{ 1475{
1481 __be32 *p; 1476 encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr);
1482 1477 encode_nfs4_stateid(xdr, arg->stateid);
1483 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); 1478 encode_nfs4_seqid(xdr, arg->seqid);
1484 *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
1485 p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
1486 *p = cpu_to_be32(arg->seqid->sequence->counter);
1487 encode_share_access(xdr, arg->fmode); 1479 encode_share_access(xdr, arg->fmode);
1488 hdr->nops++;
1489 hdr->replen += decode_open_downgrade_maxsz;
1490} 1480}
1491 1481
1492static void 1482static void
1493encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr) 1483encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr)
1494{ 1484{
1495 int len = fh->size; 1485 encode_op_hdr(xdr, OP_PUTFH, decode_putfh_maxsz, hdr);
1496 __be32 *p; 1486 encode_string(xdr, fh->size, fh->data);
1497
1498 p = reserve_space(xdr, 8 + len);
1499 *p++ = cpu_to_be32(OP_PUTFH);
1500 xdr_encode_opaque(p, fh->data, len);
1501 hdr->nops++;
1502 hdr->replen += decode_putfh_maxsz;
1503} 1487}
1504 1488
1505static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) 1489static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1506{ 1490{
1507 __be32 *p; 1491 encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr);
1508
1509 p = reserve_space(xdr, 4);
1510 *p = cpu_to_be32(OP_PUTROOTFH);
1511 hdr->nops++;
1512 hdr->replen += decode_putrootfh_maxsz;
1513} 1492}
1514 1493
1515static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid) 1494static void encode_open_stateid(struct xdr_stream *xdr,
1495 const struct nfs_open_context *ctx,
1496 const struct nfs_lock_context *l_ctx,
1497 fmode_t fmode,
1498 int zero_seqid)
1516{ 1499{
1517 nfs4_stateid stateid; 1500 nfs4_stateid stateid;
1518 __be32 *p;
1519 1501
1520 p = reserve_space(xdr, NFS4_STATEID_SIZE);
1521 if (ctx->state != NULL) { 1502 if (ctx->state != NULL) {
1522 nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); 1503 nfs4_select_rw_stateid(&stateid, ctx->state,
1504 fmode, l_ctx->lockowner, l_ctx->pid);
1523 if (zero_seqid) 1505 if (zero_seqid)
1524 stateid.stateid.seqid = 0; 1506 stateid.seqid = 0;
1525 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); 1507 encode_nfs4_stateid(xdr, &stateid);
1526 } else 1508 } else
1527 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); 1509 encode_nfs4_stateid(xdr, &zero_stateid);
1528} 1510}
1529 1511
1530static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) 1512static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
1531{ 1513{
1532 __be32 *p; 1514 __be32 *p;
1533 1515
1534 p = reserve_space(xdr, 4); 1516 encode_op_hdr(xdr, OP_READ, decode_read_maxsz, hdr);
1535 *p = cpu_to_be32(OP_READ); 1517 encode_open_stateid(xdr, args->context, args->lock_context,
1536 1518 FMODE_READ, hdr->minorversion);
1537 encode_stateid(xdr, args->context, args->lock_context,
1538 hdr->minorversion);
1539 1519
1540 p = reserve_space(xdr, 12); 1520 p = reserve_space(xdr, 12);
1541 p = xdr_encode_hyper(p, args->offset); 1521 p = xdr_encode_hyper(p, args->offset);
1542 *p = cpu_to_be32(args->count); 1522 *p = cpu_to_be32(args->count);
1543 hdr->nops++;
1544 hdr->replen += decode_read_maxsz;
1545} 1523}
1546 1524
1547static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) 1525static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
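One subtlety in encode_open_stateid() above, noted as a hedge rather than
from the patch text itself:

	/* seqid is zeroed whenever hdr->minorversion != 0 because NFSv4.1
	 * treats a stateid whose seqid is 0 as "the current incarnation"
	 * (see RFC 5661), sidestepping OLD_STATEID races between opens and
	 * in-flight reads and writes. */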
@@ -1551,7 +1529,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1551 FATTR4_WORD1_MOUNTED_ON_FILEID, 1529 FATTR4_WORD1_MOUNTED_ON_FILEID,
1552 }; 1530 };
1553 uint32_t dircount = readdir->count >> 1; 1531 uint32_t dircount = readdir->count >> 1;
1554 __be32 *p; 1532 __be32 *p, verf[2];
1555 1533
1556 if (readdir->plus) { 1534 if (readdir->plus) {
1557 attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| 1535 attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
@@ -1566,80 +1544,54 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1566 if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) 1544 if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID))
1567 attrs[0] |= FATTR4_WORD0_FILEID; 1545 attrs[0] |= FATTR4_WORD0_FILEID;
1568 1546
1569 p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); 1547 encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
1570 *p++ = cpu_to_be32(OP_READDIR); 1548 encode_uint64(xdr, readdir->cookie);
1571 p = xdr_encode_hyper(p, readdir->cookie); 1549 encode_nfs4_verifier(xdr, &readdir->verifier);
1572 p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); 1550 p = reserve_space(xdr, 20);
1573 *p++ = cpu_to_be32(dircount); 1551 *p++ = cpu_to_be32(dircount);
1574 *p++ = cpu_to_be32(readdir->count); 1552 *p++ = cpu_to_be32(readdir->count);
1575 *p++ = cpu_to_be32(2); 1553 *p++ = cpu_to_be32(2);
1576 1554
1577 *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); 1555 *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
1578 *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); 1556 *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
1579 hdr->nops++; 1557 memcpy(verf, readdir->verifier.data, sizeof(verf));
1580 hdr->replen += decode_readdir_maxsz;
1581 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", 1558 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
1582 __func__, 1559 __func__,
1583 (unsigned long long)readdir->cookie, 1560 (unsigned long long)readdir->cookie,
1584 ((u32 *)readdir->verifier.data)[0], 1561 verf[0], verf[1],
1585 ((u32 *)readdir->verifier.data)[1],
1586 attrs[0] & readdir->bitmask[0], 1562 attrs[0] & readdir->bitmask[0],
1587 attrs[1] & readdir->bitmask[1]); 1563 attrs[1] & readdir->bitmask[1]);
1588} 1564}
1589 1565
1590static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr) 1566static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
1591{ 1567{
1592 __be32 *p; 1568 encode_op_hdr(xdr, OP_READLINK, decode_readlink_maxsz, hdr);
1593
1594 p = reserve_space(xdr, 4);
1595 *p = cpu_to_be32(OP_READLINK);
1596 hdr->nops++;
1597 hdr->replen += decode_readlink_maxsz;
1598} 1569}
1599 1570
1600static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) 1571static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1601{ 1572{
1602 __be32 *p; 1573 encode_op_hdr(xdr, OP_REMOVE, decode_remove_maxsz, hdr);
1603 1574 encode_string(xdr, name->len, name->name);
1604 p = reserve_space(xdr, 8 + name->len);
1605 *p++ = cpu_to_be32(OP_REMOVE);
1606 xdr_encode_opaque(p, name->name, name->len);
1607 hdr->nops++;
1608 hdr->replen += decode_remove_maxsz;
1609} 1575}
1610 1576
1611static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr) 1577static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr)
1612{ 1578{
1613 __be32 *p; 1579 encode_op_hdr(xdr, OP_RENAME, decode_rename_maxsz, hdr);
1614
1615 p = reserve_space(xdr, 4);
1616 *p = cpu_to_be32(OP_RENAME);
1617 encode_string(xdr, oldname->len, oldname->name); 1580 encode_string(xdr, oldname->len, oldname->name);
1618 encode_string(xdr, newname->len, newname->name); 1581 encode_string(xdr, newname->len, newname->name);
1619 hdr->nops++;
1620 hdr->replen += decode_rename_maxsz;
1621} 1582}
1622 1583
1623static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr) 1584static void encode_renew(struct xdr_stream *xdr, clientid4 clid,
1585 struct compound_hdr *hdr)
1624{ 1586{
1625 __be32 *p; 1587 encode_op_hdr(xdr, OP_RENEW, decode_renew_maxsz, hdr);
1626 1588 encode_uint64(xdr, clid);
1627 p = reserve_space(xdr, 12);
1628 *p++ = cpu_to_be32(OP_RENEW);
1629 xdr_encode_hyper(p, client_stateid->cl_clientid);
1630 hdr->nops++;
1631 hdr->replen += decode_renew_maxsz;
1632} 1589}
1633 1590
1634static void 1591static void
1635encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) 1592encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1636{ 1593{
1637 __be32 *p; 1594 encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr);
1638
1639 p = reserve_space(xdr, 4);
1640 *p = cpu_to_be32(OP_RESTOREFH);
1641 hdr->nops++;
1642 hdr->replen += decode_restorefh_maxsz;
1643} 1595}
1644 1596
1645static void 1597static void
@@ -1647,9 +1599,8 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
1647{ 1599{
1648 __be32 *p; 1600 __be32 *p;
1649 1601
1650 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); 1602 encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr);
1651 *p++ = cpu_to_be32(OP_SETATTR); 1603 encode_nfs4_stateid(xdr, &zero_stateid);
1652 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
1653 p = reserve_space(xdr, 2*4); 1604 p = reserve_space(xdr, 2*4);
1654 *p++ = cpu_to_be32(1); 1605 *p++ = cpu_to_be32(1);
1655 *p = cpu_to_be32(FATTR4_WORD0_ACL); 1606 *p = cpu_to_be32(FATTR4_WORD0_ACL);
@@ -1657,30 +1608,18 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
1657 p = reserve_space(xdr, 4); 1608 p = reserve_space(xdr, 4);
1658 *p = cpu_to_be32(arg->acl_len); 1609 *p = cpu_to_be32(arg->acl_len);
1659 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1610 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
1660 hdr->nops++;
1661 hdr->replen += decode_setacl_maxsz;
1662} 1611}
1663 1612
1664static void 1613static void
1665encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr) 1614encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1666{ 1615{
1667 __be32 *p; 1616 encode_op_hdr(xdr, OP_SAVEFH, decode_savefh_maxsz, hdr);
1668
1669 p = reserve_space(xdr, 4);
1670 *p = cpu_to_be32(OP_SAVEFH);
1671 hdr->nops++;
1672 hdr->replen += decode_savefh_maxsz;
1673} 1617}
1674 1618
1675static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr) 1619static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr)
1676{ 1620{
1677 __be32 *p; 1621 encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
1678 1622 encode_nfs4_stateid(xdr, &arg->stateid);
1679 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
1680 *p++ = cpu_to_be32(OP_SETATTR);
1681 xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
1682 hdr->nops++;
1683 hdr->replen += decode_setattr_maxsz;
1684 encode_attrs(xdr, arg->iap, server); 1623 encode_attrs(xdr, arg->iap, server);
1685} 1624}
1686 1625
@@ -1688,9 +1627,8 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
1688{ 1627{
1689 __be32 *p; 1628 __be32 *p;
1690 1629
1691 p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE); 1630 encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr);
1692 *p++ = cpu_to_be32(OP_SETCLIENTID); 1631 encode_nfs4_verifier(xdr, setclientid->sc_verifier);
1693 xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
1694 1632
1695 encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); 1633 encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
1696 p = reserve_space(xdr, 4); 1634 p = reserve_space(xdr, 4);
@@ -1699,31 +1637,23 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
1699 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); 1637 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
1700 p = reserve_space(xdr, 4); 1638 p = reserve_space(xdr, 4);
1701 *p = cpu_to_be32(setclientid->sc_cb_ident); 1639 *p = cpu_to_be32(setclientid->sc_cb_ident);
1702 hdr->nops++;
1703 hdr->replen += decode_setclientid_maxsz;
1704} 1640}
1705 1641
1706static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr) 1642static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
1707{ 1643{
1708 __be32 *p; 1644 encode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM,
1709 1645 decode_setclientid_confirm_maxsz, hdr);
1710 p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); 1646 encode_uint64(xdr, arg->clientid);
1711 *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); 1647 encode_nfs4_verifier(xdr, &arg->confirm);
1712 p = xdr_encode_hyper(p, arg->clientid);
1713 xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE);
1714 hdr->nops++;
1715 hdr->replen += decode_setclientid_confirm_maxsz;
1716} 1648}
1717 1649
1718static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) 1650static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
1719{ 1651{
1720 __be32 *p; 1652 __be32 *p;
1721 1653
1722 p = reserve_space(xdr, 4); 1654 encode_op_hdr(xdr, OP_WRITE, decode_write_maxsz, hdr);
1723 *p = cpu_to_be32(OP_WRITE); 1655 encode_open_stateid(xdr, args->context, args->lock_context,
1724 1656 FMODE_WRITE, hdr->minorversion);
1725 encode_stateid(xdr, args->context, args->lock_context,
1726 hdr->minorversion);
1727 1657
1728 p = reserve_space(xdr, 16); 1658 p = reserve_space(xdr, 16);
1729 p = xdr_encode_hyper(p, args->offset); 1659 p = xdr_encode_hyper(p, args->offset);
@@ -1731,32 +1661,18 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
1731 *p = cpu_to_be32(args->count); 1661 *p = cpu_to_be32(args->count);
1732 1662
1733 xdr_write_pages(xdr, args->pages, args->pgbase, args->count); 1663 xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
1734 hdr->nops++;
1735 hdr->replen += decode_write_maxsz;
1736} 1664}
1737 1665
1738static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr) 1666static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr)
1739{ 1667{
1740 __be32 *p; 1668 encode_op_hdr(xdr, OP_DELEGRETURN, decode_delegreturn_maxsz, hdr);
1741 1669 encode_nfs4_stateid(xdr, stateid);
1742 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
1743
1744 *p++ = cpu_to_be32(OP_DELEGRETURN);
1745 xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
1746 hdr->nops++;
1747 hdr->replen += decode_delegreturn_maxsz;
1748} 1670}
1749 1671
1750static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) 1672static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1751{ 1673{
1752 int len = name->len; 1674 encode_op_hdr(xdr, OP_SECINFO, decode_secinfo_maxsz, hdr);
1753 __be32 *p; 1675 encode_string(xdr, name->len, name->name);
1754
1755 p = reserve_space(xdr, 8 + len);
1756 *p++ = cpu_to_be32(OP_SECINFO);
1757 xdr_encode_opaque(p, name->name, len);
1758 hdr->nops++;
1759 hdr->replen += decode_secinfo_maxsz;
1760} 1676}
1761 1677
1762#if defined(CONFIG_NFS_V4_1) 1678#if defined(CONFIG_NFS_V4_1)
@@ -1766,19 +1682,39 @@ static void encode_exchange_id(struct xdr_stream *xdr,
1766 struct compound_hdr *hdr) 1682 struct compound_hdr *hdr)
1767{ 1683{
1768 __be32 *p; 1684 __be32 *p;
1685 char impl_name[NFS4_OPAQUE_LIMIT];
1686 int len = 0;
1769 1687
1770 p = reserve_space(xdr, 4 + sizeof(args->verifier->data)); 1688 encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr);
1771 *p++ = cpu_to_be32(OP_EXCHANGE_ID); 1689 encode_nfs4_verifier(xdr, args->verifier);
1772 xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
1773 1690
1774 encode_string(xdr, args->id_len, args->id); 1691 encode_string(xdr, args->id_len, args->id);
1775 1692
1776 p = reserve_space(xdr, 12); 1693 p = reserve_space(xdr, 12);
1777 *p++ = cpu_to_be32(args->flags); 1694 *p++ = cpu_to_be32(args->flags);
1778 *p++ = cpu_to_be32(0); /* zero length state_protect4_a */ 1695 *p++ = cpu_to_be32(0); /* zero length state_protect4_a */
1779 *p = cpu_to_be32(0); /* zero length implementation id array */ 1696
1780 hdr->nops++; 1697 if (send_implementation_id &&
1781 hdr->replen += decode_exchange_id_maxsz; 1698 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 &&
1699 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN)
1700 <= NFS4_OPAQUE_LIMIT + 1)
1701 len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s",
1702 utsname()->sysname, utsname()->release,
1703 utsname()->version, utsname()->machine);
1704
1705 if (len > 0) {
1706 *p = cpu_to_be32(1); /* implementation id array length=1 */
1707
1708 encode_string(xdr,
1709 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - 1,
1710 CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN);
1711 encode_string(xdr, len, impl_name);
1712 /* just send zeros for nii_date - the date is in nii_name */
1713 p = reserve_space(xdr, 12);
1714 p = xdr_encode_hyper(p, 0);
1715 *p = cpu_to_be32(0);
1716 } else
1717 *p = cpu_to_be32(0); /* implementation id array length=0 */
1782} 1718}
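
For reference, the nii_domain/nii_name/nii_date triple encoded above is RFC 5661's nfs_impl_id4 (the XDR below is quoted from the RFC, not from this patch):

struct nfs_impl_id4 {
	utf8str_cis	nii_domain;	/* CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN */
	utf8str_cs	nii_name;	/* "sysname release version machine" from utsname() */
	nfstime4	nii_date;	/* sent as zeros here; the date lives in nii_name */
};

So a client might send, e.g., nii_name = "Linux 3.3.0-rc1 #1 SMP Fri Feb 3 12:00:00 UTC 2012 x86_64" (an illustrative value, not taken from the patch).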
1783 1719
1784static void encode_create_session(struct xdr_stream *xdr, 1720static void encode_create_session(struct xdr_stream *xdr,
@@ -1801,8 +1737,8 @@ static void encode_create_session(struct xdr_stream *xdr,
1801 len = scnprintf(machine_name, sizeof(machine_name), "%s", 1737 len = scnprintf(machine_name, sizeof(machine_name), "%s",
1802 clp->cl_ipaddr); 1738 clp->cl_ipaddr);
1803 1739
1804 p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); 1740 encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr);
1805 *p++ = cpu_to_be32(OP_CREATE_SESSION); 1741 p = reserve_space(xdr, 16 + 2*28 + 20 + len + 12);
1806 p = xdr_encode_hyper(p, clp->cl_clientid); 1742 p = xdr_encode_hyper(p, clp->cl_clientid);
1807 *p++ = cpu_to_be32(clp->cl_seqid); /* Sequence id */ 1743 *p++ = cpu_to_be32(clp->cl_seqid); /* Sequence id */
1808 *p++ = cpu_to_be32(args->flags); /* flags */ 1744 *p++ = cpu_to_be32(args->flags); /* flags */
@@ -1835,33 +1771,22 @@ static void encode_create_session(struct xdr_stream *xdr,
1835 *p++ = cpu_to_be32(0); /* UID */ 1771 *p++ = cpu_to_be32(0); /* UID */
1836 *p++ = cpu_to_be32(0); /* GID */ 1772 *p++ = cpu_to_be32(0); /* GID */
1837 *p = cpu_to_be32(0); /* No more gids */ 1773 *p = cpu_to_be32(0); /* No more gids */
1838 hdr->nops++;
1839 hdr->replen += decode_create_session_maxsz;
1840} 1774}
1841 1775
1842static void encode_destroy_session(struct xdr_stream *xdr, 1776static void encode_destroy_session(struct xdr_stream *xdr,
1843 struct nfs4_session *session, 1777 struct nfs4_session *session,
1844 struct compound_hdr *hdr) 1778 struct compound_hdr *hdr)
1845{ 1779{
1846 __be32 *p; 1780 encode_op_hdr(xdr, OP_DESTROY_SESSION, decode_destroy_session_maxsz, hdr);
1847 p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN); 1781 encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1848 *p++ = cpu_to_be32(OP_DESTROY_SESSION);
1849 xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1850 hdr->nops++;
1851 hdr->replen += decode_destroy_session_maxsz;
1852} 1782}
1853 1783
1854static void encode_reclaim_complete(struct xdr_stream *xdr, 1784static void encode_reclaim_complete(struct xdr_stream *xdr,
1855 struct nfs41_reclaim_complete_args *args, 1785 struct nfs41_reclaim_complete_args *args,
1856 struct compound_hdr *hdr) 1786 struct compound_hdr *hdr)
1857{ 1787{
1858 __be32 *p; 1788 encode_op_hdr(xdr, OP_RECLAIM_COMPLETE, decode_reclaim_complete_maxsz, hdr);
1859 1789 encode_uint32(xdr, args->one_fs);
1860 p = reserve_space(xdr, 8);
1861 *p++ = cpu_to_be32(OP_RECLAIM_COMPLETE);
1862 *p++ = cpu_to_be32(args->one_fs);
1863 hdr->nops++;
1864 hdr->replen += decode_reclaim_complete_maxsz;
1865} 1790}
1866#endif /* CONFIG_NFS_V4_1 */ 1791#endif /* CONFIG_NFS_V4_1 */
1867 1792
@@ -1883,8 +1808,7 @@ static void encode_sequence(struct xdr_stream *xdr,
1883 WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); 1808 WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
1884 slot = tp->slots + args->sa_slotid; 1809 slot = tp->slots + args->sa_slotid;
1885 1810
1886 p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16); 1811 encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr);
1887 *p++ = cpu_to_be32(OP_SEQUENCE);
1888 1812
1889 /* 1813 /*
1890 * Sessionid + seqid + slotid + max slotid + cache_this 1814 * Sessionid + seqid + slotid + max slotid + cache_this
@@ -1898,13 +1822,12 @@ static void encode_sequence(struct xdr_stream *xdr,
1898 ((u32 *)session->sess_id.data)[3], 1822 ((u32 *)session->sess_id.data)[3],
1899 slot->seq_nr, args->sa_slotid, 1823 slot->seq_nr, args->sa_slotid,
1900 tp->highest_used_slotid, args->sa_cache_this); 1824 tp->highest_used_slotid, args->sa_cache_this);
1825 p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16);
1901 p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); 1826 p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1902 *p++ = cpu_to_be32(slot->seq_nr); 1827 *p++ = cpu_to_be32(slot->seq_nr);
1903 *p++ = cpu_to_be32(args->sa_slotid); 1828 *p++ = cpu_to_be32(args->sa_slotid);
1904 *p++ = cpu_to_be32(tp->highest_used_slotid); 1829 *p++ = cpu_to_be32(tp->highest_used_slotid);
1905 *p = cpu_to_be32(args->sa_cache_this); 1830 *p = cpu_to_be32(args->sa_cache_this);
1906 hdr->nops++;
1907 hdr->replen += decode_sequence_maxsz;
1908#endif /* CONFIG_NFS_V4_1 */ 1831#endif /* CONFIG_NFS_V4_1 */
1909} 1832}
1910 1833
@@ -1919,14 +1842,12 @@ encode_getdevicelist(struct xdr_stream *xdr,
1919 .data = "dummmmmy", 1842 .data = "dummmmmy",
1920 }; 1843 };
1921 1844
1922 p = reserve_space(xdr, 20); 1845 encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr);
1923 *p++ = cpu_to_be32(OP_GETDEVICELIST); 1846 p = reserve_space(xdr, 16);
1924 *p++ = cpu_to_be32(args->layoutclass); 1847 *p++ = cpu_to_be32(args->layoutclass);
1925 *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); 1848 *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
1926 xdr_encode_hyper(p, 0ULL); /* cookie */ 1849 xdr_encode_hyper(p, 0ULL); /* cookie */
1927 encode_nfs4_verifier(xdr, &dummy); 1850 encode_nfs4_verifier(xdr, &dummy);
1928 hdr->nops++;
1929 hdr->replen += decode_getdevicelist_maxsz;
1930} 1851}
1931 1852
1932static void 1853static void
@@ -1936,15 +1857,13 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
1936{ 1857{
1937 __be32 *p; 1858 __be32 *p;
1938 1859
1939 p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE); 1860 encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr);
1940 *p++ = cpu_to_be32(OP_GETDEVICEINFO); 1861 p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE);
1941 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, 1862 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
1942 NFS4_DEVICEID4_SIZE); 1863 NFS4_DEVICEID4_SIZE);
1943 *p++ = cpu_to_be32(args->pdev->layout_type); 1864 *p++ = cpu_to_be32(args->pdev->layout_type);
1944 *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */ 1865 *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */
1945 *p++ = cpu_to_be32(0); /* bitmap length 0 */ 1866 *p++ = cpu_to_be32(0); /* bitmap length 0 */
1946 hdr->nops++;
1947 hdr->replen += decode_getdeviceinfo_maxsz;
1948} 1867}
1949 1868
1950static void 1869static void
@@ -1954,16 +1873,16 @@ encode_layoutget(struct xdr_stream *xdr,
1954{ 1873{
1955 __be32 *p; 1874 __be32 *p;
1956 1875
1957 p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); 1876 encode_op_hdr(xdr, OP_LAYOUTGET, decode_layoutget_maxsz, hdr);
1958 *p++ = cpu_to_be32(OP_LAYOUTGET); 1877 p = reserve_space(xdr, 36);
1959 *p++ = cpu_to_be32(0); /* Signal layout available */ 1878 *p++ = cpu_to_be32(0); /* Signal layout available */
1960 *p++ = cpu_to_be32(args->type); 1879 *p++ = cpu_to_be32(args->type);
1961 *p++ = cpu_to_be32(args->range.iomode); 1880 *p++ = cpu_to_be32(args->range.iomode);
1962 p = xdr_encode_hyper(p, args->range.offset); 1881 p = xdr_encode_hyper(p, args->range.offset);
1963 p = xdr_encode_hyper(p, args->range.length); 1882 p = xdr_encode_hyper(p, args->range.length);
1964 p = xdr_encode_hyper(p, args->minlength); 1883 p = xdr_encode_hyper(p, args->minlength);
1965 p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE); 1884 encode_nfs4_stateid(xdr, &args->stateid);
1966 *p = cpu_to_be32(args->maxcount); 1885 encode_uint32(xdr, args->maxcount);
1967 1886
1968 dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", 1887 dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
1969 __func__, 1888 __func__,
@@ -1972,8 +1891,6 @@ encode_layoutget(struct xdr_stream *xdr,
1972 (unsigned long)args->range.offset, 1891 (unsigned long)args->range.offset,
1973 (unsigned long)args->range.length, 1892 (unsigned long)args->range.length,
1974 args->maxcount); 1893 args->maxcount);
1975 hdr->nops++;
1976 hdr->replen += decode_layoutget_maxsz;
1977} 1894}
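
For comparison, the fields emitted above line up one-for-one with RFC 5661's LAYOUTGET4args (XDR quoted from the RFC, not from this patch):

struct LAYOUTGET4args {
	/* CURRENT_FH: file */
	bool		loga_signal_layout_avail;	/* the leading zero above */
	layouttype4	loga_layout_type;
	layoutiomode4	loga_iomode;
	offset4		loga_offset;
	length4		loga_length;
	length4		loga_minlength;
	stateid4	loga_stateid;
	count4		loga_maxcount;
};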
1978 1895
1979static int 1896static int
@@ -1987,13 +1904,14 @@ encode_layoutcommit(struct xdr_stream *xdr,
1987 dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten, 1904 dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
1988 NFS_SERVER(args->inode)->pnfs_curr_ld->id); 1905 NFS_SERVER(args->inode)->pnfs_curr_ld->id);
1989 1906
1990 p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); 1907 encode_op_hdr(xdr, OP_LAYOUTCOMMIT, decode_layoutcommit_maxsz, hdr);
1991 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); 1908 p = reserve_space(xdr, 20);
1992 /* Only whole file layouts */ 1909 /* Only whole file layouts */
1993 p = xdr_encode_hyper(p, 0); /* offset */ 1910 p = xdr_encode_hyper(p, 0); /* offset */
1994 p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ 1911 p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */
1995 *p++ = cpu_to_be32(0); /* reclaim */ 1912 *p = cpu_to_be32(0); /* reclaim */
1996 p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); 1913 encode_nfs4_stateid(xdr, &args->stateid);
1914 p = reserve_space(xdr, 20);
1997 *p++ = cpu_to_be32(1); /* newoffset = TRUE */ 1915 *p++ = cpu_to_be32(1); /* newoffset = TRUE */
1998 p = xdr_encode_hyper(p, args->lastbytewritten); 1916 p = xdr_encode_hyper(p, args->lastbytewritten);
1999 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ 1917 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
@@ -2002,13 +1920,9 @@ encode_layoutcommit(struct xdr_stream *xdr,
2002 if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) 1920 if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
2003 NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( 1921 NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
2004 NFS_I(inode)->layout, xdr, args); 1922 NFS_I(inode)->layout, xdr, args);
2005 else { 1923 else
2006 p = reserve_space(xdr, 4); 1924 encode_uint32(xdr, 0); /* no layout-type payload */
2007 *p = cpu_to_be32(0); /* no layout-type payload */
2008 }
2009 1925
2010 hdr->nops++;
2011 hdr->replen += decode_layoutcommit_maxsz;
2012 return 0; 1926 return 0;
2013} 1927}
2014 1928
@@ -2019,27 +1933,23 @@ encode_layoutreturn(struct xdr_stream *xdr,
2019{ 1933{
2020 __be32 *p; 1934 __be32 *p;
2021 1935
2022 p = reserve_space(xdr, 20); 1936 encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr);
2023 *p++ = cpu_to_be32(OP_LAYOUTRETURN); 1937 p = reserve_space(xdr, 16);
2024 *p++ = cpu_to_be32(0); /* reclaim; always 0 for now */ 1938 *p++ = cpu_to_be32(0); /* reclaim; always 0 for now */
2025 *p++ = cpu_to_be32(args->layout_type); 1939 *p++ = cpu_to_be32(args->layout_type);
2026 *p++ = cpu_to_be32(IOMODE_ANY); 1940 *p++ = cpu_to_be32(IOMODE_ANY);
2027 *p = cpu_to_be32(RETURN_FILE); 1941 *p = cpu_to_be32(RETURN_FILE);
2028 p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); 1942 p = reserve_space(xdr, 16);
2029 p = xdr_encode_hyper(p, 0); 1943 p = xdr_encode_hyper(p, 0);
2030 p = xdr_encode_hyper(p, NFS4_MAX_UINT64); 1944 p = xdr_encode_hyper(p, NFS4_MAX_UINT64);
2031 spin_lock(&args->inode->i_lock); 1945 spin_lock(&args->inode->i_lock);
2032 xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE); 1946 encode_nfs4_stateid(xdr, &args->stateid);
2033 spin_unlock(&args->inode->i_lock); 1947 spin_unlock(&args->inode->i_lock);
2034 if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) { 1948 if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
2035 NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn( 1949 NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
2036 NFS_I(args->inode)->layout, xdr, args); 1950 NFS_I(args->inode)->layout, xdr, args);
2037 } else { 1951 } else
2038 p = reserve_space(xdr, 4); 1952 encode_uint32(xdr, 0);
2039 *p = cpu_to_be32(0);
2040 }
2041 hdr->nops++;
2042 hdr->replen += decode_layoutreturn_maxsz;
2043} 1953}
2044 1954
2045static int 1955static int
@@ -2047,12 +1957,8 @@ encode_secinfo_no_name(struct xdr_stream *xdr,
2047 const struct nfs41_secinfo_no_name_args *args, 1957 const struct nfs41_secinfo_no_name_args *args,
2048 struct compound_hdr *hdr) 1958 struct compound_hdr *hdr)
2049{ 1959{
2050 __be32 *p; 1960 encode_op_hdr(xdr, OP_SECINFO_NO_NAME, decode_secinfo_no_name_maxsz, hdr);
2051 p = reserve_space(xdr, 8); 1961 encode_uint32(xdr, args->style);
2052 *p++ = cpu_to_be32(OP_SECINFO_NO_NAME);
2053 *p++ = cpu_to_be32(args->style);
2054 hdr->nops++;
2055 hdr->replen += decode_secinfo_no_name_maxsz;
2056 return 0; 1962 return 0;
2057} 1963}
2058 1964
@@ -2060,26 +1966,17 @@ static void encode_test_stateid(struct xdr_stream *xdr,
2060 struct nfs41_test_stateid_args *args, 1966 struct nfs41_test_stateid_args *args,
2061 struct compound_hdr *hdr) 1967 struct compound_hdr *hdr)
2062{ 1968{
2063 __be32 *p; 1969 encode_op_hdr(xdr, OP_TEST_STATEID, decode_test_stateid_maxsz, hdr);
2064 1970 encode_uint32(xdr, 1);
2065 p = reserve_space(xdr, 8 + NFS4_STATEID_SIZE); 1971 encode_nfs4_stateid(xdr, args->stateid);
2066 *p++ = cpu_to_be32(OP_TEST_STATEID);
2067 *p++ = cpu_to_be32(1);
2068 xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
2069 hdr->nops++;
2070 hdr->replen += decode_test_stateid_maxsz;
2071} 1972}
2072 1973
2073static void encode_free_stateid(struct xdr_stream *xdr, 1974static void encode_free_stateid(struct xdr_stream *xdr,
2074 struct nfs41_free_stateid_args *args, 1975 struct nfs41_free_stateid_args *args,
2075 struct compound_hdr *hdr) 1976 struct compound_hdr *hdr)
2076{ 1977{
2077 __be32 *p; 1978 encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr);
2078 p = reserve_space(xdr, 4 + NFS4_STATEID_SIZE); 1979 encode_nfs4_stateid(xdr, args->stateid);
2079 *p++ = cpu_to_be32(OP_FREE_STATEID);
2080 xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
2081 hdr->nops++;
2082 hdr->replen += decode_free_stateid_maxsz;
2083} 1980}
2084#endif /* CONFIG_NFS_V4_1 */ 1981#endif /* CONFIG_NFS_V4_1 */
2085 1982
@@ -2633,6 +2530,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
2633 encode_sequence(xdr, &args->seq_args, &hdr); 2530 encode_sequence(xdr, &args->seq_args, &hdr);
2634 encode_putfh(xdr, args->fhandle, &hdr); 2531 encode_putfh(xdr, args->fhandle, &hdr);
2635 encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS| 2532 encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
2533 FATTR4_WORD0_FH_EXPIRE_TYPE|
2636 FATTR4_WORD0_LINK_SUPPORT| 2534 FATTR4_WORD0_LINK_SUPPORT|
2637 FATTR4_WORD0_SYMLINK_SUPPORT| 2535 FATTR4_WORD0_SYMLINK_SUPPORT|
2638 FATTR4_WORD0_ACLSUPPORT, &hdr); 2536 FATTR4_WORD0_ACLSUPPORT, &hdr);
@@ -2650,7 +2548,7 @@ static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr,
2650 }; 2548 };
2651 2549
2652 encode_compound_hdr(xdr, req, &hdr); 2550 encode_compound_hdr(xdr, req, &hdr);
2653 encode_renew(xdr, clp, &hdr); 2551 encode_renew(xdr, clp->cl_clientid, &hdr);
2654 encode_nops(&hdr); 2552 encode_nops(&hdr);
2655} 2553}
2656 2554
@@ -3180,6 +3078,28 @@ out_overflow:
3180 return -EIO; 3078 return -EIO;
3181} 3079}
3182 3080
3081static int decode_attr_fh_expire_type(struct xdr_stream *xdr,
3082 uint32_t *bitmap, uint32_t *type)
3083{
3084 __be32 *p;
3085
3086 *type = 0;
3087 if (unlikely(bitmap[0] & (FATTR4_WORD0_FH_EXPIRE_TYPE - 1U)))
3088 return -EIO;
3089 if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) {
3090 p = xdr_inline_decode(xdr, 4);
3091 if (unlikely(!p))
3092 goto out_overflow;
3093 *type = be32_to_cpup(p);
3094 bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE;
3095 }
3096 dprintk("%s: expire type=0x%x\n", __func__, *type);
3097 return 0;
3098out_overflow:
3099 print_overflow_msg(__func__, xdr);
3100 return -EIO;
3101}
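
The bitmap[0] & (FATTR4_WORD0_FH_EXPIRE_TYPE - 1U) guard works because attributes are decoded in ascending bit order and each decoder clears its own bit on the way through. A worked instance (attribute numbers per the NFSv4 spec; shown only for illustration):

/* fh_expire_type is attribute 2, so FATTR4_WORD0_FH_EXPIRE_TYPE == 1 << 2.
 * (1 << 2) - 1 == 0x3, the mask of all lower-numbered word-0 bits
 * (supported_attrs == bit 0, type == bit 1). If either bit is still set
 * here, an earlier attribute was left undecoded, the stream position is
 * unknown, and the only safe answer is -EIO.
 */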
3102
3183static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) 3103static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
3184{ 3104{
3185 __be32 *p; 3105 __be32 *p;
@@ -3513,16 +3433,17 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
3513 n = be32_to_cpup(p); 3433 n = be32_to_cpup(p);
3514 if (n == 0) 3434 if (n == 0)
3515 goto root_path; 3435 goto root_path;
3516 dprintk("path "); 3436 dprintk("pathname4: ");
3517 path->ncomponents = 0; 3437 path->ncomponents = 0;
3518 while (path->ncomponents < n) { 3438 while (path->ncomponents < n) {
3519 struct nfs4_string *component = &path->components[path->ncomponents]; 3439 struct nfs4_string *component = &path->components[path->ncomponents];
3520 status = decode_opaque_inline(xdr, &component->len, &component->data); 3440 status = decode_opaque_inline(xdr, &component->len, &component->data);
3521 if (unlikely(status != 0)) 3441 if (unlikely(status != 0))
3522 goto out_eio; 3442 goto out_eio;
3523 if (path->ncomponents != n) 3443 ifdebug (XDR)
3524 dprintk("/"); 3444 pr_cont("%s%.*s ",
3525 dprintk("%s", component->data); 3445 (path->ncomponents != n ? "/ " : ""),
3446 component->len, component->data);
3526 if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS) 3447 if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS)
3527 path->ncomponents++; 3448 path->ncomponents++;
3528 else { 3449 else {
@@ -3531,14 +3452,13 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
3531 } 3452 }
3532 } 3453 }
3533out: 3454out:
3534 dprintk("\n");
3535 return status; 3455 return status;
3536root_path: 3456root_path:
3537/* a root pathname is sent as a zero component4 */ 3457/* a root pathname is sent as a zero component4 */
3538 path->ncomponents = 1; 3458 path->ncomponents = 1;
3539 path->components[0].len=0; 3459 path->components[0].len=0;
3540 path->components[0].data=NULL; 3460 path->components[0].data=NULL;
3541 dprintk("path /\n"); 3461 dprintk("pathname4: /\n");
3542 goto out; 3462 goto out;
3543out_eio: 3463out_eio:
3544 dprintk(" status %d", status); 3464 dprintk(" status %d", status);
@@ -3560,7 +3480,11 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
3560 status = 0; 3480 status = 0;
3561 if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS))) 3481 if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
3562 goto out; 3482 goto out;
3563 dprintk("%s: fsroot ", __func__); 3483 status = -EIO;
3484 /* Ignore broken servers that return unrequested attrs */

3485 if (unlikely(res == NULL))
3486 goto out;
3487 dprintk("%s: fsroot:\n", __func__);
3564 status = decode_pathname(xdr, &res->fs_path); 3488 status = decode_pathname(xdr, &res->fs_path);
3565 if (unlikely(status != 0)) 3489 if (unlikely(status != 0))
3566 goto out; 3490 goto out;
@@ -3581,7 +3505,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
3581 m = be32_to_cpup(p); 3505 m = be32_to_cpup(p);
3582 3506
3583 loc->nservers = 0; 3507 loc->nservers = 0;
3584 dprintk("%s: servers ", __func__); 3508 dprintk("%s: servers:\n", __func__);
3585 while (loc->nservers < m) { 3509 while (loc->nservers < m) {
3586 struct nfs4_string *server = &loc->servers[loc->nservers]; 3510 struct nfs4_string *server = &loc->servers[loc->nservers];
3587 status = decode_opaque_inline(xdr, &server->len, &server->data); 3511 status = decode_opaque_inline(xdr, &server->len, &server->data);
@@ -3613,7 +3537,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
3613 res->nlocations++; 3537 res->nlocations++;
3614 } 3538 }
3615 if (res->nlocations != 0) 3539 if (res->nlocations != 0)
3616 status = NFS_ATTR_FATTR_V4_REFERRAL; 3540 status = NFS_ATTR_FATTR_V4_LOCATIONS;
3617out: 3541out:
3618 dprintk("%s: fs_locations done, error = %d\n", __func__, status); 3542 dprintk("%s: fs_locations done, error = %d\n", __func__, status);
3619 return status; 3543 return status;
@@ -4157,7 +4081,7 @@ static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
4157 4081
4158static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) 4082static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
4159{ 4083{
4160 return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); 4084 return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
4161} 4085}
4162 4086
4163static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) 4087static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
@@ -4174,7 +4098,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
4174 4098
4175static int decode_verifier(struct xdr_stream *xdr, void *verifier) 4099static int decode_verifier(struct xdr_stream *xdr, void *verifier)
4176{ 4100{
4177 return decode_opaque_fixed(xdr, verifier, 8); 4101 return decode_opaque_fixed(xdr, verifier, NFS4_VERIFIER_SIZE);
4178} 4102}
4179 4103
4180static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) 4104static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
@@ -4224,6 +4148,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re
4224 goto xdr_error; 4148 goto xdr_error;
4225 if ((status = decode_attr_supported(xdr, bitmap, res->attr_bitmask)) != 0) 4149 if ((status = decode_attr_supported(xdr, bitmap, res->attr_bitmask)) != 0)
4226 goto xdr_error; 4150 goto xdr_error;
4151 if ((status = decode_attr_fh_expire_type(xdr, bitmap,
4152 &res->fh_expire_type)) != 0)
4153 goto xdr_error;
4227 if ((status = decode_attr_link_support(xdr, bitmap, &res->has_links)) != 0) 4154 if ((status = decode_attr_link_support(xdr, bitmap, &res->has_links)) != 0)
4228 goto xdr_error; 4155 goto xdr_error;
4229 if ((status = decode_attr_symlink_support(xdr, bitmap, &res->has_symlinks)) != 0) 4156 if ((status = decode_attr_symlink_support(xdr, bitmap, &res->has_symlinks)) != 0)
@@ -4294,6 +4221,7 @@ xdr_error:
4294 4221
4295static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, 4222static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
4296 struct nfs_fattr *fattr, struct nfs_fh *fh, 4223 struct nfs_fattr *fattr, struct nfs_fh *fh,
4224 struct nfs4_fs_locations *fs_loc,
4297 const struct nfs_server *server) 4225 const struct nfs_server *server)
4298{ 4226{
4299 int status; 4227 int status;
@@ -4341,9 +4269,7 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
4341 goto xdr_error; 4269 goto xdr_error;
4342 fattr->valid |= status; 4270 fattr->valid |= status;
4343 4271
4344 status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, 4272 status = decode_attr_fs_locations(xdr, bitmap, fs_loc);
4345 struct nfs4_fs_locations,
4346 fattr));
4347 if (status < 0) 4273 if (status < 0)
4348 goto xdr_error; 4274 goto xdr_error;
4349 fattr->valid |= status; 4275 fattr->valid |= status;
@@ -4407,7 +4333,8 @@ xdr_error:
4407} 4333}
4408 4334
4409static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, 4335static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4410 struct nfs_fh *fh, const struct nfs_server *server) 4336 struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc,
4337 const struct nfs_server *server)
4411{ 4338{
4412 __be32 *savep; 4339 __be32 *savep;
4413 uint32_t attrlen, 4340 uint32_t attrlen,
@@ -4426,7 +4353,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
4426 if (status < 0) 4353 if (status < 0)
4427 goto xdr_error; 4354 goto xdr_error;
4428 4355
4429 status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server); 4356 status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server);
4430 if (status < 0) 4357 if (status < 0)
4431 goto xdr_error; 4358 goto xdr_error;
4432 4359
@@ -4439,7 +4366,7 @@ xdr_error:
4439static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, 4366static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4440 const struct nfs_server *server) 4367 const struct nfs_server *server)
4441{ 4368{
4442 return decode_getfattr_generic(xdr, fattr, NULL, server); 4369 return decode_getfattr_generic(xdr, fattr, NULL, NULL, server);
4443} 4370}
4444 4371
4445/* 4372/*
@@ -4463,8 +4390,8 @@ static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
4463 return 0; 4390 return 0;
4464 } 4391 }
4465 if (num > 1) 4392 if (num > 1)
4466 printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers " 4393 printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout "
4467 "per filesystem not supported\n", __func__); 4394 "drivers per filesystem not supported\n", __func__);
4468 4395
4469 /* Decode and set first layout type, move xdr->p past unused types */ 4396 /* Decode and set first layout type, move xdr->p past unused types */
4470 p = xdr_inline_decode(xdr, num * 4); 4397 p = xdr_inline_decode(xdr, num * 4);
@@ -4863,17 +4790,16 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
4863 size_t hdrlen; 4790 size_t hdrlen;
4864 u32 recvd, pglen = rcvbuf->page_len; 4791 u32 recvd, pglen = rcvbuf->page_len;
4865 int status; 4792 int status;
4793 __be32 verf[2];
4866 4794
4867 status = decode_op_hdr(xdr, OP_READDIR); 4795 status = decode_op_hdr(xdr, OP_READDIR);
4868 if (!status) 4796 if (!status)
4869 status = decode_verifier(xdr, readdir->verifier.data); 4797 status = decode_verifier(xdr, readdir->verifier.data);
4870 if (unlikely(status)) 4798 if (unlikely(status))
4871 return status; 4799 return status;
4800 memcpy(verf, readdir->verifier.data, sizeof(verf));
4872 dprintk("%s: verifier = %08x:%08x\n", 4801 dprintk("%s: verifier = %08x:%08x\n",
4873 __func__, 4802 __func__, verf[0], verf[1]);
4874 ((u32 *)readdir->verifier.data)[0],
4875 ((u32 *)readdir->verifier.data)[1]);
4876
4877 4803
4878 hdrlen = (char *) xdr->p - (char *) iov->iov_base; 4804 hdrlen = (char *) xdr->p - (char *) iov->iov_base;
4879 recvd = rcvbuf->len - hdrlen; 4805 recvd = rcvbuf->len - hdrlen;
@@ -5120,7 +5046,7 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
5120 goto out_overflow; 5046 goto out_overflow;
5121 res->count = be32_to_cpup(p++); 5047 res->count = be32_to_cpup(p++);
5122 res->verf->committed = be32_to_cpup(p++); 5048 res->verf->committed = be32_to_cpup(p++);
5123 memcpy(res->verf->verifier, p, 8); 5049 memcpy(res->verf->verifier, p, NFS4_VERIFIER_SIZE);
5124 return 0; 5050 return 0;
5125out_overflow: 5051out_overflow:
5126 print_overflow_msg(__func__, xdr); 5052 print_overflow_msg(__func__, xdr);
@@ -5214,6 +5140,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
5214 char *dummy_str; 5140 char *dummy_str;
5215 int status; 5141 int status;
5216 struct nfs_client *clp = res->client; 5142 struct nfs_client *clp = res->client;
5143 uint32_t impl_id_count;
5217 5144
5218 status = decode_op_hdr(xdr, OP_EXCHANGE_ID); 5145 status = decode_op_hdr(xdr, OP_EXCHANGE_ID);
5219 if (status) 5146 if (status)
@@ -5255,11 +5182,38 @@ static int decode_exchange_id(struct xdr_stream *xdr,
5255 memcpy(res->server_scope->server_scope, dummy_str, dummy); 5182 memcpy(res->server_scope->server_scope, dummy_str, dummy);
5256 res->server_scope->server_scope_sz = dummy; 5183 res->server_scope->server_scope_sz = dummy;
5257 5184
5258 /* Throw away Implementation id array */ 5185 /* Implementation Id */
5259 status = decode_opaque_inline(xdr, &dummy, &dummy_str); 5186 p = xdr_inline_decode(xdr, 4);
5260 if (unlikely(status)) 5187 if (unlikely(!p))
5261 return status; 5188 goto out_overflow;
5189 impl_id_count = be32_to_cpup(p++);
5190
5191 if (impl_id_count) {
5192 /* nii_domain */
5193 status = decode_opaque_inline(xdr, &dummy, &dummy_str);
5194 if (unlikely(status))
5195 return status;
5196 if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
5197 return -EIO;
5198 memcpy(res->impl_id->domain, dummy_str, dummy);
5262 5199
5200 /* nii_name */
5201 status = decode_opaque_inline(xdr, &dummy, &dummy_str);
5202 if (unlikely(status))
5203 return status;
5204 if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
5205 return -EIO;
5206 memcpy(res->impl_id->name, dummy_str, dummy);
5207
5208 /* nii_date */
5209 p = xdr_inline_decode(xdr, 12);
5210 if (unlikely(!p))
5211 goto out_overflow;
5212 p = xdr_decode_hyper(p, &res->impl_id->date.seconds);
5213 res->impl_id->date.nseconds = be32_to_cpup(p);
5214
5215 /* if there's more than one entry, ignore the rest */
5216 }
5263 return 0; 5217 return 0;
5264out_overflow: 5218out_overflow:
5265 print_overflow_msg(__func__, xdr); 5219 print_overflow_msg(__func__, xdr);
@@ -5285,8 +5239,8 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
5285 attrs->max_reqs = be32_to_cpup(p++); 5239 attrs->max_reqs = be32_to_cpup(p++);
5286 nr_attrs = be32_to_cpup(p); 5240 nr_attrs = be32_to_cpup(p);
5287 if (unlikely(nr_attrs > 1)) { 5241 if (unlikely(nr_attrs > 1)) {
5288 printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", 5242 printk(KERN_WARNING "NFS: %s: Invalid rdma channel attrs "
5289 __func__, nr_attrs); 5243 "count %u\n", __func__, nr_attrs);
5290 return -EINVAL; 5244 return -EINVAL;
5291 } 5245 }
5292 if (nr_attrs == 1) { 5246 if (nr_attrs == 1) {
@@ -5436,14 +5390,14 @@ static int decode_getdevicelist(struct xdr_stream *xdr,
5436 p += 2; 5390 p += 2;
5437 5391
5438 /* Read verifier */ 5392 /* Read verifier */
5439 p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); 5393 p = xdr_decode_opaque_fixed(p, verftemp.verifier, NFS4_VERIFIER_SIZE);
5440 5394
5441 res->num_devs = be32_to_cpup(p); 5395 res->num_devs = be32_to_cpup(p);
5442 5396
5443 dprintk("%s: num_dev %d\n", __func__, res->num_devs); 5397 dprintk("%s: num_dev %d\n", __func__, res->num_devs);
5444 5398
5445 if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) { 5399 if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
5446 printk(KERN_ERR "%s too many result dev_num %u\n", 5400 printk(KERN_ERR "NFS: %s too many result dev_num %u\n",
5447 __func__, res->num_devs); 5401 __func__, res->num_devs);
5448 return -EIO; 5402 return -EIO;
5449 } 5403 }
@@ -5537,11 +5491,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
5537 status = decode_op_hdr(xdr, OP_LAYOUTGET); 5491 status = decode_op_hdr(xdr, OP_LAYOUTGET);
5538 if (status) 5492 if (status)
5539 return status; 5493 return status;
5540 p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE); 5494 p = xdr_inline_decode(xdr, 4);
5495 if (unlikely(!p))
5496 goto out_overflow;
5497 res->return_on_close = be32_to_cpup(p);
5498 decode_stateid(xdr, &res->stateid);
5499 p = xdr_inline_decode(xdr, 4);
5541 if (unlikely(!p)) 5500 if (unlikely(!p))
5542 goto out_overflow; 5501 goto out_overflow;
5543 res->return_on_close = be32_to_cpup(p++);
5544 p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE);
5545 layout_count = be32_to_cpup(p); 5502 layout_count = be32_to_cpup(p);
5546 if (!layout_count) { 5503 if (!layout_count) {
5547 dprintk("%s: server responded with empty layout array\n", 5504 dprintk("%s: server responded with empty layout array\n",
@@ -5666,7 +5623,8 @@ static int decode_test_stateid(struct xdr_stream *xdr,
5666 if (unlikely(!p)) 5623 if (unlikely(!p))
5667 goto out_overflow; 5624 goto out_overflow;
5668 res->status = be32_to_cpup(p++); 5625 res->status = be32_to_cpup(p++);
5669 return res->status; 5626
5627 return status;
5670out_overflow: 5628out_overflow:
5671 print_overflow_msg(__func__, xdr); 5629 print_overflow_msg(__func__, xdr);
5672out: 5630out:
@@ -6583,8 +6541,9 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
6583 if (status) 6541 if (status)
6584 goto out; 6542 goto out;
6585 xdr_enter_page(xdr, PAGE_SIZE); 6543 xdr_enter_page(xdr, PAGE_SIZE);
6586 status = decode_getfattr(xdr, &res->fs_locations->fattr, 6544 status = decode_getfattr_generic(xdr, &res->fs_locations->fattr,
6587 res->fs_locations->server); 6545 NULL, res->fs_locations,
6546 res->fs_locations->server);
6588out: 6547out:
6589 return status; 6548 return status;
6590} 6549}
@@ -6964,7 +6923,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6964 goto out_overflow; 6923 goto out_overflow;
6965 6924
6966 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, 6925 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
6967 entry->server) < 0) 6926 NULL, entry->server) < 0)
6968 goto out_overflow; 6927 goto out_overflow;
6969 if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) 6928 if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
6970 entry->ino = entry->fattr->mounted_on_fileid; 6929 entry->ino = entry->fattr->mounted_on_fileid;
@@ -7112,7 +7071,7 @@ struct rpc_procinfo nfs4_procedures[] = {
7112#endif /* CONFIG_NFS_V4_1 */ 7071#endif /* CONFIG_NFS_V4_1 */
7113}; 7072};
7114 7073
7115struct rpc_version nfs_version4 = { 7074const struct rpc_version nfs_version4 = {
7116 .number = 4, 7075 .number = 4,
7117 .nrprocs = ARRAY_SIZE(nfs4_procedures), 7076 .nrprocs = ARRAY_SIZE(nfs4_procedures),
7118 .procs = nfs4_procedures 7077 .procs = nfs4_procedures
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index c4744e1d513c..cd3c910d2d12 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -104,7 +104,7 @@ static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
104/* server:export path string passed to super.c */ 104/* server:export path string passed to super.c */
105static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = ""; 105static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
106 106
107#ifdef RPC_DEBUG 107#ifdef NFS_DEBUG
108/* 108/*
109 * When the "nfsrootdebug" kernel command line option is specified, 109 * When the "nfsrootdebug" kernel command line option is specified,
110 * enable debugging messages for NFSROOT. 110 * enable debugging messages for NFSROOT.
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 55d01280a609..4bff4a3dab46 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -137,6 +137,7 @@ static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
137 struct objio_dev_ent *ode; 137 struct objio_dev_ent *ode;
138 struct osd_dev *od; 138 struct osd_dev *od;
139 struct osd_dev_info odi; 139 struct osd_dev_info odi;
140 bool retry_flag = true;
140 int err; 141 int err;
141 142
142 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); 143 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
@@ -171,10 +172,18 @@ static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
171 goto out; 172 goto out;
172 } 173 }
173 174
175retry_lookup:
174 od = osduld_info_lookup(&odi); 176 od = osduld_info_lookup(&odi);
175 if (unlikely(IS_ERR(od))) { 177 if (unlikely(IS_ERR(od))) {
176 err = PTR_ERR(od); 178 err = PTR_ERR(od);
177 dprintk("%s: osduld_info_lookup => %d\n", __func__, err); 179 dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
180 if (err == -ENODEV && retry_flag) {
181 err = objlayout_autologin(deviceaddr);
182 if (likely(!err)) {
183 retry_flag = false;
184 goto retry_lookup;
185 }
186 }
178 goto out; 187 goto out;
179 } 188 }
180 189
@@ -205,25 +214,36 @@ static void copy_single_comp(struct ore_components *oc, unsigned c,
205int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, 214int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
206 struct objio_segment **pseg) 215 struct objio_segment **pseg)
207{ 216{
208 struct __alloc_objio_segment { 217/* This is the in-memory structure of the objio_segment
209 struct objio_segment olseg; 218 *
210 struct ore_dev *ods[numdevs]; 219 * struct __alloc_objio_segment {
211 struct ore_comp comps[numdevs]; 220 * struct objio_segment olseg;
212 } *aolseg; 221 * struct ore_dev *ods[numdevs];
213 222 * struct ore_comp comps[numdevs];
214 aolseg = kzalloc(sizeof(*aolseg), gfp_flags); 223 * } *aolseg;
215 if (unlikely(!aolseg)) { 224 * NOTE: The code as above compiles and runs perfectly. It is elegant,
225 * type safe and compact. At some past time, Linus decided he does not
226 * like variable-length arrays; for the sake of this principle we uglify
227 * the code as below.
228 */
229 struct objio_segment *lseg;
230 size_t lseg_size = sizeof(*lseg) +
231 numdevs * sizeof(lseg->oc.ods[0]) +
232 numdevs * sizeof(*lseg->oc.comps);
233
234 lseg = kzalloc(lseg_size, gfp_flags);
235 if (unlikely(!lseg)) {
216 dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__, 236 dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__,
217 numdevs, sizeof(*aolseg)); 237 numdevs, lseg_size);
218 return -ENOMEM; 238 return -ENOMEM;
219 } 239 }
220 240
221 aolseg->olseg.oc.numdevs = numdevs; 241 lseg->oc.numdevs = numdevs;
222 aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS; 242 lseg->oc.single_comp = EC_MULTPLE_COMPS;
223 aolseg->olseg.oc.comps = aolseg->comps; 243 lseg->oc.ods = (void *)(lseg + 1);
224 aolseg->olseg.oc.ods = aolseg->ods; 244 lseg->oc.comps = (void *)(lseg->oc.ods + numdevs);
225 245
226 *pseg = &aolseg->olseg; 246 *pseg = lseg;
227 return 0; 247 return 0;
228} 248}
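
The hand-rolled replacement reproduces the layout the variable-length-array struct used to describe; roughly (an illustrative picture of the single kzalloc() above, not part of the patch):

/*
 *   lseg                       struct objio_segment (header)
 *   (void *)(lseg + 1)         ods[0] .. ods[numdevs-1]    (struct ore_dev *)
 *   lseg->oc.ods + numdevs     comps[0] .. comps[numdevs-1] (struct ore_comp)
 *
 * oc.ods and oc.comps point into the same allocation, so a single
 * kfree() of the segment releases everything at once.
 */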
229 249
@@ -582,10 +602,10 @@ objlayout_init(void)
582 602
583 if (ret) 603 if (ret)
584 printk(KERN_INFO 604 printk(KERN_INFO
585 "%s: Registering OSD pNFS Layout Driver failed: error=%d\n", 605 "NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n",
586 __func__, ret); 606 __func__, ret);
587 else 607 else
588 printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", 608 printk(KERN_INFO "NFS: %s: Registered OSD pNFS Layout Driver\n",
589 __func__); 609 __func__);
590 return ret; 610 return ret;
591} 611}
@@ -594,7 +614,7 @@ static void __exit
594objlayout_exit(void) 614objlayout_exit(void)
595{ 615{
596 pnfs_unregister_layoutdriver(&objlayout_type); 616 pnfs_unregister_layoutdriver(&objlayout_type);
597 printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", 617 printk(KERN_INFO "NFS: %s: Unregistered OSD pNFS Layout Driver\n",
598 __func__); 618 __func__);
599} 619}
600 620
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index b3c29039f5b8..8d45f1c318ce 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -37,6 +37,9 @@
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */ 38 */
39 39
40#include <linux/kmod.h>
41#include <linux/moduleparam.h>
42#include <linux/ratelimit.h>
40#include <scsi/osd_initiator.h> 43#include <scsi/osd_initiator.h>
41#include "objlayout.h" 44#include "objlayout.h"
42 45
@@ -156,7 +159,7 @@ last_byte_offset(u64 start, u64 len)
156 return end > start ? end - 1 : NFS4_MAX_UINT64; 159 return end > start ? end - 1 : NFS4_MAX_UINT64;
157} 160}
158 161
159void _fix_verify_io_params(struct pnfs_layout_segment *lseg, 162static void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
160 struct page ***p_pages, unsigned *p_pgbase, 163 struct page ***p_pages, unsigned *p_pgbase,
161 u64 offset, unsigned long count) 164 u64 offset, unsigned long count)
162{ 165{
@@ -490,9 +493,9 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p)
490 if (!ioerr->oer_errno) 493 if (!ioerr->oer_errno)
491 continue; 494 continue;
492 495
493 printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " 496 printk(KERN_ERR "NFS: %s: err[%d]: errno=%d "
494 "dev(%llx:%llx) par=0x%llx obj=0x%llx " 497 "is_write=%d dev(%llx:%llx) par=0x%llx "
495 "offset=0x%llx length=0x%llx\n", 498 "obj=0x%llx offset=0x%llx length=0x%llx\n",
496 __func__, i, ioerr->oer_errno, 499 __func__, i, ioerr->oer_errno,
497 ioerr->oer_iswrite, 500 ioerr->oer_iswrite,
498 _DEVID_LO(&ioerr->oer_component.oid_device_id), 501 _DEVID_LO(&ioerr->oer_component.oid_device_id),
@@ -651,3 +654,134 @@ void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
651 __free_page(odi->page); 654 __free_page(odi->page);
652 kfree(odi); 655 kfree(odi);
653} 656}
657
658enum {
659 OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
660 OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
661 OSD_LOGIN_UPCALL_PATHLEN = 256
662};
663
664static char osd_login_prog[OSD_LOGIN_UPCALL_PATHLEN] = "/sbin/osd_login";
665
666module_param_string(osd_login_prog, osd_login_prog, sizeof(osd_login_prog),
667 0600);
668MODULE_PARM_DESC(osd_login_prog, "Path to the osd_login upcall program");
669
670struct __auto_login {
671 char uri[OBJLAYOUT_MAX_URI_LEN];
672 char osdname[OBJLAYOUT_MAX_OSDNAME_LEN];
673 char systemid_hex[OBJLAYOUT_MAX_SYSID_HEX_LEN];
674};
675
676static int __objlayout_upcall(struct __auto_login *login)
677{
678 static char *envp[] = { "HOME=/",
679 "TERM=linux",
680 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
681 NULL
682 };
683 char *argv[8];
684 int ret;
685
686 if (unlikely(!osd_login_prog[0])) {
687 dprintk("%s: osd_login_prog is disabled\n", __func__);
688 return -EACCES;
689 }
690
691 dprintk("%s uri: %s\n", __func__, login->uri);
692 dprintk("%s osdname %s\n", __func__, login->osdname);
693 dprintk("%s systemid_hex %s\n", __func__, login->systemid_hex);
694
695 argv[0] = (char *)osd_login_prog;
696 argv[1] = "-u";
697 argv[2] = login->uri;
698 argv[3] = "-o";
699 argv[4] = login->osdname;
700 argv[5] = "-s";
701 argv[6] = login->systemid_hex;
702 argv[7] = NULL;
703
704 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
705 /*
706 * Disable the upcall mechanism if we're getting an ENOENT or
707 * EACCES error. The admin can re-enable it on the fly by using
708 * sysfs to set the objlayoutdriver.osd_login_prog module parameter once
709 * the problem has been fixed.
710 */
711 if (ret == -ENOENT || ret == -EACCES) {
712 printk(KERN_ERR "PNFS-OBJ: %s was not found please set "
713 "objlayoutdriver.osd_login_prog kernel parameter!\n",
714 osd_login_prog);
715 osd_login_prog[0] = '\0';
716 }
717 dprintk("%s %s return value: %d\n", __func__, osd_login_prog, ret);
718
719 return ret;
720}
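
Putting the argv together, the helper ends up being run as (placeholder values, not from the patch):

	/sbin/osd_login -u <target URI> -o <osdname> -s <systemid in hex>

with call_usermodehelper() blocking until the program exits (UMH_WAIT_PROC), so the script's exit status is what propagates back as ret.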
721
722/* Assume dest is all zeros */
723static void __copy_nfsS_and_zero_terminate(struct nfs4_string s,
724 char *dest, int max_len,
725 const char *var_name)
726{
727 if (!s.len)
728 return;
729
730 if (s.len >= max_len) {
731 pr_warn_ratelimited(
732 "objlayout_autologin: %s: s.len(%d) >= max_len(%d)",
733 var_name, s.len, max_len);
734 s.len = max_len - 1; /* space for null terminator */
735 }
736
737 memcpy(dest, s.data, s.len);
738}
739
740/* Assume sysid is all zeros */
741static void _sysid_2_hex(struct nfs4_string s,
742 char sysid[OBJLAYOUT_MAX_SYSID_HEX_LEN])
743{
744 int i;
745 char *cur;
746
747 if (!s.len)
748 return;
749
750 if (s.len != OSD_SYSTEMID_LEN) {
751 pr_warn_ratelimited(
752 "objlayout_autologin: systemid_len(%d) != OSD_SYSTEMID_LEN",
753 s.len);
754 if (s.len > OSD_SYSTEMID_LEN)
755 s.len = OSD_SYSTEMID_LEN;
756 }
757
758 cur = sysid;
759 for (i = 0; i < s.len; i++)
760 cur = hex_byte_pack(cur, s.data[i]);
761}
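
A small worked example of the packing above (values illustrative): hex_byte_pack() writes two lowercase hex digits per byte and returns the advanced cursor, so a systemid beginning { 0x00, 0x1f } becomes "001f...". The caller hands in a zeroed buffer, and OBJLAYOUT_MAX_SYSID_HEX_LEN == OSD_SYSTEMID_LEN * 2 + 1 leaves exactly one byte for the terminating NUL.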
762
763int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr)
764{
765 int rc;
766 struct __auto_login login;
767
768 if (!deviceaddr->oda_targetaddr.ota_netaddr.r_addr.len)
769 return -ENODEV;
770
771 memset(&login, 0, sizeof(login));
772 __copy_nfsS_and_zero_terminate(
773 deviceaddr->oda_targetaddr.ota_netaddr.r_addr,
774 login.uri, sizeof(login.uri), "URI");
775
776 __copy_nfsS_and_zero_terminate(
777 deviceaddr->oda_osdname,
778 login.osdname, sizeof(login.osdname), "OSDNAME");
779
780 _sysid_2_hex(deviceaddr->oda_systemid, login.systemid_hex);
781
782 rc = __objlayout_upcall(&login);
783 if (rc > 0) /* script returns positive values */
784 rc = -ENODEV;
785
786 return rc;
787}
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 8ec34727ed21..880ba086be94 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -184,4 +184,6 @@ extern void objlayout_encode_layoutreturn(
184 struct xdr_stream *, 184 struct xdr_stream *,
185 const struct nfs4_layoutreturn_args *); 185 const struct nfs4_layoutreturn_args *);
186 186
187extern int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr);
188
187#endif /* _OBJLAYOUT_H */ 189#endif /* _OBJLAYOUT_H */
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 5668f7c54c41..d21fceaa9f62 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -13,6 +13,7 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
16#include <linux/nfs.h>
16#include <linux/nfs3.h> 17#include <linux/nfs3.h>
17#include <linux/nfs4.h> 18#include <linux/nfs4.h>
18#include <linux/nfs_page.h> 19#include <linux/nfs_page.h>
@@ -106,36 +107,6 @@ void nfs_unlock_request(struct nfs_page *req)
106 nfs_release_request(req); 107 nfs_release_request(req);
107} 108}
108 109
109/**
110 * nfs_set_page_tag_locked - Tag a request as locked
111 * @req:
112 */
113int nfs_set_page_tag_locked(struct nfs_page *req)
114{
115 if (!nfs_lock_request_dontget(req))
116 return 0;
117 if (test_bit(PG_MAPPED, &req->wb_flags))
118 radix_tree_tag_set(&NFS_I(req->wb_context->dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
119 return 1;
120}
121
122/**
123 * nfs_clear_page_tag_locked - Clear request tag and wake up sleepers
124 */
125void nfs_clear_page_tag_locked(struct nfs_page *req)
126{
127 if (test_bit(PG_MAPPED, &req->wb_flags)) {
128 struct inode *inode = req->wb_context->dentry->d_inode;
129 struct nfs_inode *nfsi = NFS_I(inode);
130
131 spin_lock(&inode->i_lock);
132 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
133 nfs_unlock_request(req);
134 spin_unlock(&inode->i_lock);
135 } else
136 nfs_unlock_request(req);
137}
138
139/* 110/*
140 * nfs_clear_request - Free up all resources allocated to the request 111 * nfs_clear_request - Free up all resources allocated to the request
141 * @req: 112 * @req:
@@ -425,67 +396,6 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
425 } 396 }
426} 397}
427 398
428#define NFS_SCAN_MAXENTRIES 16
429/**
430 * nfs_scan_list - Scan a list for matching requests
431 * @nfsi: NFS inode
432 * @dst: Destination list
433 * @idx_start: lower bound of page->index to scan
434 * @npages: idx_start + npages sets the upper bound to scan.
435 * @tag: tag to scan for
436 *
437 * Moves elements from one of the inode request lists.
438 * If the number of requests is set to 0, the entire address_space
439 * starting at index idx_start, is scanned.
440 * The requests are *not* checked to ensure that they form a contiguous set.
441 * You must be holding the inode's i_lock when calling this function
442 */
443int nfs_scan_list(struct nfs_inode *nfsi,
444 struct list_head *dst, pgoff_t idx_start,
445 unsigned int npages, int tag)
446{
447 struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
448 struct nfs_page *req;
449 pgoff_t idx_end;
450 int found, i;
451 int res;
452 struct list_head *list;
453
454 res = 0;
455 if (npages == 0)
456 idx_end = ~0;
457 else
458 idx_end = idx_start + npages - 1;
459
460 for (;;) {
461 found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree,
462 (void **)&pgvec[0], idx_start,
463 NFS_SCAN_MAXENTRIES, tag);
464 if (found <= 0)
465 break;
466 for (i = 0; i < found; i++) {
467 req = pgvec[i];
468 if (req->wb_index > idx_end)
469 goto out;
470 idx_start = req->wb_index + 1;
471 if (nfs_set_page_tag_locked(req)) {
472 kref_get(&req->wb_kref);
473 radix_tree_tag_clear(&nfsi->nfs_page_tree,
474 req->wb_index, tag);
475 list = pnfs_choose_commit_list(req, dst);
476 nfs_list_add_request(req, list);
477 res++;
478 if (res == INT_MAX)
479 goto out;
480 }
481 }
482 /* for latency reduction */
483 cond_resched_lock(&nfsi->vfs_inode.i_lock);
484 }
485out:
486 return res;
487}
488
489int __init nfs_init_nfspagecache(void) 399int __init nfs_init_nfspagecache(void)
490{ 400{
491 nfs_page_cachep = kmem_cache_create("nfs_page", 401 nfs_page_cachep = kmem_cache_create("nfs_page",
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 17149a490065..b5d451586943 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -101,8 +101,8 @@ set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
101 goto out_no_driver; 101 goto out_no_driver;
102 if (!(server->nfs_client->cl_exchange_flags & 102 if (!(server->nfs_client->cl_exchange_flags &
103 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { 103 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
104 printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__, 104 printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n",
105 id, server->nfs_client->cl_exchange_flags); 105 __func__, id, server->nfs_client->cl_exchange_flags);
106 goto out_no_driver; 106 goto out_no_driver;
107 } 107 }
108 ld_type = find_pnfs_driver(id); 108 ld_type = find_pnfs_driver(id);
@@ -122,8 +122,8 @@ set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
122 server->pnfs_curr_ld = ld_type; 122 server->pnfs_curr_ld = ld_type;
123 if (ld_type->set_layoutdriver 123 if (ld_type->set_layoutdriver
124 && ld_type->set_layoutdriver(server, mntfh)) { 124 && ld_type->set_layoutdriver(server, mntfh)) {
125 printk(KERN_ERR "%s: Error initializing pNFS layout driver %u.\n", 125 printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
126 __func__, id); 126 "driver %u.\n", __func__, id);
127 module_put(ld_type->owner); 127 module_put(ld_type->owner);
128 goto out_no_driver; 128 goto out_no_driver;
129 } 129 }
@@ -143,11 +143,11 @@ pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
143 struct pnfs_layoutdriver_type *tmp; 143 struct pnfs_layoutdriver_type *tmp;
144 144
145 if (ld_type->id == 0) { 145 if (ld_type->id == 0) {
146 printk(KERN_ERR "%s id 0 is reserved\n", __func__); 146 printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
147 return status; 147 return status;
148 } 148 }
149 if (!ld_type->alloc_lseg || !ld_type->free_lseg) { 149 if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
150 printk(KERN_ERR "%s Layout driver must provide " 150 printk(KERN_ERR "NFS: %s Layout driver must provide "
151 "alloc_lseg and free_lseg.\n", __func__); 151 "alloc_lseg and free_lseg.\n", __func__);
152 return status; 152 return status;
153 } 153 }
@@ -160,7 +160,7 @@ pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
160 dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id, 160 dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
161 ld_type->name); 161 ld_type->name);
162 } else { 162 } else {
163 printk(KERN_ERR "%s Module with id %d already loaded!\n", 163 printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",
164 __func__, ld_type->id); 164 __func__, ld_type->id);
165 } 165 }
166 spin_unlock(&pnfs_spinlock); 166 spin_unlock(&pnfs_spinlock);
@@ -496,12 +496,12 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
496{ 496{
497 u32 oldseq, newseq; 497 u32 oldseq, newseq;
498 498
499 oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid); 499 oldseq = be32_to_cpu(lo->plh_stateid.seqid);
500 newseq = be32_to_cpu(new->stateid.seqid); 500 newseq = be32_to_cpu(new->seqid);
501 if ((int)(newseq - oldseq) > 0) { 501 if ((int)(newseq - oldseq) > 0) {
502 memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid)); 502 nfs4_stateid_copy(&lo->plh_stateid, new);
503 if (update_barrier) { 503 if (update_barrier) {
504 u32 new_barrier = be32_to_cpu(new->stateid.seqid); 504 u32 new_barrier = be32_to_cpu(new->seqid);
505 505
506 if ((int)(new_barrier - lo->plh_barrier)) 506 if ((int)(new_barrier - lo->plh_barrier))
507 lo->plh_barrier = new_barrier; 507 lo->plh_barrier = new_barrier;
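
The new ->seqid accesses and nfs4_stateid_copy() calls rely on the reworked stateid type introduced earlier in this series; presumably something like the sketch below, assuming it mirrors the on-the-wire stateid4 (a 4-byte seqid followed by a 12-byte "other" field):

typedef struct {
	union {
		char data[NFS4_STATEID_SIZE];		/* 16 bytes, as before */
		struct {
			__be32 seqid;
			char other[NFS4_STATEID_OTHER_SIZE];
		} __attribute__ ((packed));
	};
} nfs4_stateid;

static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
{
	memcpy(dst, src, sizeof(*dst));
}

The union keeps the old .data accessors working while letting callers such as pnfs_set_layout_stateid() read the sequence number directly.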
@@ -525,7 +525,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
 			int lget)
 {
 	if ((stateid) &&
-	    (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
+	    (int)(lo->plh_barrier - be32_to_cpu(stateid->seqid)) >= 0)
 		return true;
 	return lo->plh_block_lgets ||
 		test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
@@ -549,11 +549,10 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 
 		do {
 			seq = read_seqbegin(&open_state->seqlock);
-			memcpy(dst->data, open_state->stateid.data,
-			       sizeof(open_state->stateid.data));
+			nfs4_stateid_copy(dst, &open_state->stateid);
 		} while (read_seqretry(&open_state->seqlock, seq));
 	} else
-		memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data));
+		nfs4_stateid_copy(dst, &lo->plh_stateid);
 	spin_unlock(&lo->plh_inode->i_lock);
 	dprintk("<-- %s\n", __func__);
 	return status;
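The conversions above replace open-coded memcpy()s of stateid buffers with one helper. The patch does not show the helper's body; a plausible shape (an assumption, not taken from this diff) is a plain structure copy. The surrounding seqlock loop then just retries that copy whenever a writer raced with the reader:

    /* Assumed shape of the helper -- its body is not shown in this diff. */
    static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
    {
        memcpy(dst, src, sizeof(*dst));
    }

    /* Reader side of the seqlock, as in pnfs_choose_layoutget_stateid() above:
     * the copy is tentative and is redone if read_seqretry() reports a race. */
    do {
        seq = read_seqbegin(&open_state->seqlock);
        nfs4_stateid_copy(dst, &open_state->stateid);
    } while (read_seqretry(&open_state->seqlock, seq));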
@@ -590,7 +589,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
 	max_pages = max_resp_sz >> PAGE_SHIFT;
 
-	pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags);
+	pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
 	if (!pages)
 		goto out_err_free;
 
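The kzalloc-to-kcalloc switch above is more than style: `kzalloc(n * size, ...)` can overflow the multiplication and silently allocate a short buffer, while `kcalloc(n, size, ...)` checks the product and returns NULL on overflow. A userspace analogue of the check it performs (a sketch, not the kernel implementation):

    #include <stdint.h>
    #include <stdlib.h>

    /* Roughly what an overflow-checked array allocation must do. */
    static void *checked_alloc_array(size_t n, size_t size)
    {
        if (size != 0 && n > SIZE_MAX / size)
            return NULL;            /* n * size would wrap */
        return calloc(n, size);     /* zeroed, like kcalloc */
    }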
@@ -760,7 +759,7 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
 	}
 	if (!found) {
 		struct pnfs_layout_hdr *lo = nfsi->layout;
-		u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid);
+		u32 current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
 
 		/* Since close does not return a layout stateid for use as
 		 * a barrier, we choose the worst-case barrier.
@@ -966,8 +965,7 @@ pnfs_update_layout(struct inode *ino,
 	}
 
 	/* Do we even need to bother with this? */
-	if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
-	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
 		dprintk("%s matches recall, use MDS\n", __func__);
 		goto out_unlock;
 	}
@@ -1032,7 +1030,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	struct nfs4_layoutget_res *res = &lgp->res;
 	struct pnfs_layout_segment *lseg;
 	struct inode *ino = lo->plh_inode;
-	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
 	int status = 0;
 
 	/* Inject layout blob into I/O device driver */
@@ -1048,8 +1045,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	}
 
 	spin_lock(&ino->i_lock);
-	if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
-	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
 		dprintk("%s forget reply due to recall\n", __func__);
 		goto out_forget_reply;
 	}
@@ -1214,6 +1210,7 @@ void pnfs_ld_write_done(struct nfs_write_data *data)
 		}
 		data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages);
 	}
+	put_lseg(data->lseg);
 	data->mds_ops->rpc_release(data);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
@@ -1227,6 +1224,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
 	nfs_list_add_request(data->req, &desc->pg_list);
 	nfs_pageio_reset_write_mds(desc);
 	desc->pg_recoalesce = 1;
+	put_lseg(data->lseg);
 	nfs_writedata_release(data);
 }
 
@@ -1327,6 +1325,7 @@ void pnfs_ld_read_done(struct nfs_read_data *data)
 		data->mds_ops->rpc_call_done(&data->task, data);
 	} else
 		pnfs_ld_handle_read_error(data);
+	put_lseg(data->lseg);
 	data->mds_ops->rpc_release(data);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
@@ -1530,8 +1529,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 	end_pos = nfsi->layout->plh_lwb;
 	nfsi->layout->plh_lwb = 0;
 
-	memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data,
-		sizeof(nfsi->layout->plh_stateid.data));
+	nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
 	spin_unlock(&inode->i_lock);
 
 	data->args.inode = inode;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 53d593a0a4f2..442ebf68eeec 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -94,11 +94,10 @@ struct pnfs_layoutdriver_type {
 	const struct nfs_pageio_ops *pg_read_ops;
 	const struct nfs_pageio_ops *pg_write_ops;
 
-	/* Returns true if layoutdriver wants to divert this request to
-	 * driver's commit routine.
-	 */
-	bool (*mark_pnfs_commit)(struct pnfs_layout_segment *lseg);
-	struct list_head * (*choose_commit_list) (struct nfs_page *req);
+	void (*mark_request_commit) (struct nfs_page *req,
+				     struct pnfs_layout_segment *lseg);
+	void (*clear_request_commit) (struct nfs_page *req);
+	int (*scan_commit_lists) (struct inode *inode, int max, spinlock_t *lock);
 	int (*commit_pagelist)(struct inode *inode, struct list_head *mds_pages, int how);
 
 	/*
@@ -229,7 +228,6 @@ struct nfs4_deviceid_node {
 	atomic_t ref;
 };
 
-void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
 struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
 void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
 void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
@@ -262,20 +260,6 @@ static inline int pnfs_enabled_sb(struct nfs_server *nfss)
 	return nfss->pnfs_curr_ld != NULL;
 }
 
-static inline void
-pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
-{
-	if (lseg) {
-		struct pnfs_layoutdriver_type *ld;
-
-		ld = NFS_SERVER(req->wb_page->mapping->host)->pnfs_curr_ld;
-		if (ld->mark_pnfs_commit && ld->mark_pnfs_commit(lseg)) {
-			set_bit(PG_PNFS_COMMIT, &req->wb_flags);
-			req->wb_commit_lseg = get_lseg(lseg);
-		}
-	}
-}
-
 static inline int
 pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
 {
@@ -284,27 +268,42 @@ pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
 	return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how);
 }
 
-static inline struct list_head *
-pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds)
+static inline bool
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
 {
-	struct list_head *rv;
+	struct inode *inode = req->wb_context->dentry->d_inode;
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
 
-	if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) {
-		struct inode *inode = req->wb_commit_lseg->pls_layout->plh_inode;
+	if (lseg == NULL || ld->mark_request_commit == NULL)
+		return false;
+	ld->mark_request_commit(req, lseg);
+	return true;
+}
 
-		set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags);
-		rv = NFS_SERVER(inode)->pnfs_curr_ld->choose_commit_list(req);
-		/* matched by ref taken when PG_PNFS_COMMIT is set */
-		put_lseg(req->wb_commit_lseg);
-	} else
-		rv = mds;
-	return rv;
+static inline bool
+pnfs_clear_request_commit(struct nfs_page *req)
+{
+	struct inode *inode = req->wb_context->dentry->d_inode;
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+	if (ld == NULL || ld->clear_request_commit == NULL)
+		return false;
+	ld->clear_request_commit(req);
+	return true;
 }
 
-static inline void pnfs_clear_request_commit(struct nfs_page *req)
+static inline int
+pnfs_scan_commit_lists(struct inode *inode, int max, spinlock_t *lock)
 {
-	if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags))
-		put_lseg(req->wb_commit_lseg);
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+	int ret;
+
+	if (ld == NULL || ld->scan_commit_lists == NULL)
+		return 0;
+	ret = ld->scan_commit_lists(inode, max, lock);
+	if (ret != 0)
+		set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags);
+	return ret;
 }
 
 /* Should the pNFS client commit and return the layout upon a setattr */
@@ -328,6 +327,13 @@ static inline int pnfs_return_layout(struct inode *ino)
 	return 0;
 }
 
+#ifdef NFS_DEBUG
+void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
+#else
+static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
+{
+}
+#endif /* NFS_DEBUG */
 #else /* CONFIG_NFS_V4_1 */
 
 static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -400,35 +406,35 @@ static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, st
 	return false;
 }
 
-static inline void
-pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
-{
-}
-
 static inline int
 pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
 {
 	return PNFS_NOT_ATTEMPTED;
 }
 
-static inline struct list_head *
-pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds)
+static inline bool
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
 {
-	return mds;
+	return false;
 }
 
-static inline void pnfs_clear_request_commit(struct nfs_page *req)
+static inline bool
+pnfs_clear_request_commit(struct nfs_page *req)
 {
+	return false;
 }
 
-static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
+static inline int
+pnfs_scan_commit_lists(struct inode *inode, int max, spinlock_t *lock)
 {
 	return 0;
 }
 
-static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl)
+static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 {
+	return 0;
 }
+
 #endif /* CONFIG_NFS_V4_1 */
 
 #endif /* FS_NFS_PNFS_H */
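The new pnfs.h wrappers above all follow one pattern: each layout-driver hook is optional, the wrapper reports via its bool/int return whether the driver handled the request, and callers fall back to the generic MDS path otherwise. A self-contained sketch of that dispatch pattern (the struct and function names here are hypothetical, not the kernel's):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct commit_ops {
        void (*mark_request_commit)(int req);  /* optional hook */
    };

    static bool mark_request_commit(const struct commit_ops *ld, int req)
    {
        if (ld == NULL || ld->mark_request_commit == NULL)
            return false;              /* caller uses the generic list */
        ld->mark_request_commit(req);
        return true;                   /* driver diverted the request */
    }

    static void my_mark(int req) { printf("driver took req %d\n", req); }

    int main(void)
    {
        struct commit_ops ld = { my_mark };
        if (!mark_request_commit(NULL, 1))
            printf("req 1 falls back to the MDS commit list\n");
        mark_request_commit(&ld, 2);
        return 0;
    }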
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 4f359d2a26eb..73f701f1f4d3 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -43,6 +43,7 @@
 static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
 static DEFINE_SPINLOCK(nfs4_deviceid_lock);
 
+#ifdef NFS_DEBUG
 void
 nfs4_print_deviceid(const struct nfs4_deviceid *id)
 {
@@ -52,6 +53,7 @@ nfs4_print_deviceid(const struct nfs4_deviceid *id)
 		p[0], p[1], p[2], p[3]);
 }
 EXPORT_SYMBOL_GPL(nfs4_print_deviceid);
+#endif
 
 static inline u32
 nfs4_deviceid_hash(const struct nfs4_deviceid *id)
@@ -92,7 +94,7 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
  * @clp nfs_client associated with deviceid
  * @id deviceid to look up
  */
-struct nfs4_deviceid_node *
+static struct nfs4_deviceid_node *
 _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
 		   const struct nfs_client *clp, const struct nfs4_deviceid *id,
 		   long hash)
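Moving nfs4_print_deviceid() under NFS_DEBUG pairs the real definition with the empty static inline stub added to pnfs.h above, so call sites need no #ifdefs and the call compiles away entirely in non-debug builds. The generic shape of the idiom, with illustrative names:

    #include <stdio.h>

    #ifdef MYDEBUG
    static void debug_dump(unsigned v) { fprintf(stderr, "v=%u\n", v); }
    #else
    static inline void debug_dump(unsigned v) { (void)v; }  /* optimized out */
    #endif

    int main(void)
    {
        debug_dump(42);  /* a no-op unless built with -DMYDEBUG */
        return 0;
    }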
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 0c672588fe5a..b63b6f4d14fb 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -358,6 +358,11 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 	msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE];
 }
 
+static void nfs_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
+{
+	rpc_call_start(task);
+}
+
 static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 {
 	if (nfs_async_handle_expired_key(task))
@@ -372,6 +377,11 @@ nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
 	msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME];
 }
 
+static void nfs_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
+{
+	rpc_call_start(task);
+}
+
 static int
 nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
 		     struct inode *new_dir)
@@ -651,6 +661,11 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *
 	msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
 }
 
+static void nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+{
+	rpc_call_start(task);
+}
+
 static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	if (nfs_async_handle_expired_key(task))
@@ -668,6 +683,11 @@ static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message
 	msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
 }
 
+static void nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
+{
+	rpc_call_start(task);
+}
+
 static void
 nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
@@ -721,9 +741,11 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
 	.create		= nfs_proc_create,
 	.remove		= nfs_proc_remove,
 	.unlink_setup	= nfs_proc_unlink_setup,
+	.unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare,
 	.unlink_done	= nfs_proc_unlink_done,
 	.rename		= nfs_proc_rename,
 	.rename_setup	= nfs_proc_rename_setup,
+	.rename_rpc_prepare = nfs_proc_rename_rpc_prepare,
 	.rename_done	= nfs_proc_rename_done,
 	.link		= nfs_proc_link,
 	.symlink	= nfs_proc_symlink,
@@ -736,8 +758,10 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
 	.pathconf	= nfs_proc_pathconf,
 	.decode_dirent	= nfs2_decode_dirent,
 	.read_setup	= nfs_proc_read_setup,
+	.read_rpc_prepare = nfs_proc_read_rpc_prepare,
 	.read_done	= nfs_read_done,
 	.write_setup	= nfs_proc_write_setup,
+	.write_rpc_prepare = nfs_proc_write_rpc_prepare,
 	.write_done	= nfs_write_done,
 	.commit_setup	= nfs_proc_commit_setup,
 	.lock		= nfs_proc_lock,
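These trivial *_rpc_prepare helpers exist so that the generic read/write/unlink/rename paths (see the read.c, unlink.c and write.c hunks below) can always call through the nfs_rpc_ops table: NFSv2/v3 just start the call, while NFSv4.1 can set up its session slot first. A miniature of the indirection, with hypothetical types rather than the kernel's:

    #include <stdio.h>

    struct rpc_ops {
        void (*read_rpc_prepare)(void);  /* per-version hook */
    };

    static void v2_read_prepare(void)  { puts("rpc_call_start"); }
    static void v41_read_prepare(void) { puts("set up session slot, then start"); }

    /* Generic code no longer needs #if defined(CONFIG_NFS_V4_1) here. */
    static void nfs_read_prepare(const struct rpc_ops *ops)
    {
        ops->read_rpc_prepare();
    }

    int main(void)
    {
        struct rpc_ops v2 = { v2_read_prepare }, v41 = { v41_read_prepare };
        nfs_read_prepare(&v2);
        nfs_read_prepare(&v41);
        return 0;
    }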
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index b83e89bf4a74..9a0e8ef4a409 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -65,7 +65,6 @@ void nfs_readdata_free(struct nfs_read_data *p)
 
 void nfs_readdata_release(struct nfs_read_data *rdata)
 {
-	put_lseg(rdata->lseg);
 	put_nfs_open_context(rdata->args.context);
 	nfs_readdata_free(rdata);
 }
@@ -464,23 +463,14 @@ static void nfs_readpage_release_partial(void *calldata)
 	nfs_readdata_release(calldata);
 }
 
-#if defined(CONFIG_NFS_V4_1)
 void nfs_read_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
-
-	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
-				&data->args.seq_args, &data->res.seq_res,
-				0, task))
-		return;
-	rpc_call_start(task);
+	NFS_PROTO(data->inode)->read_rpc_prepare(task, data);
 }
-#endif /* CONFIG_NFS_V4_1 */
 
 static const struct rpc_call_ops nfs_read_partial_ops = {
-#if defined(CONFIG_NFS_V4_1)
 	.rpc_call_prepare = nfs_read_prepare,
-#endif /* CONFIG_NFS_V4_1 */
 	.rpc_call_done = nfs_readpage_result_partial,
 	.rpc_release = nfs_readpage_release_partial,
 };
@@ -544,9 +534,7 @@ static void nfs_readpage_release_full(void *calldata)
 }
 
 static const struct rpc_call_ops nfs_read_full_ops = {
-#if defined(CONFIG_NFS_V4_1)
 	.rpc_call_prepare = nfs_read_prepare,
-#endif /* CONFIG_NFS_V4_1 */
 	.rpc_call_done = nfs_readpage_result_full,
 	.rpc_release = nfs_readpage_release_full,
 };
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e3f6b2349411..37412f706b32 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -52,6 +52,8 @@
 #include <linux/nfs_xdr.h>
 #include <linux/magic.h>
 #include <linux/parser.h>
+#include <linux/nsproxy.h>
+#include <linux/rcupdate.h>
 
 #include <asm/uaccess.h>
 
@@ -78,7 +80,6 @@ enum {
 	Opt_cto, Opt_nocto,
 	Opt_ac, Opt_noac,
 	Opt_lock, Opt_nolock,
-	Opt_v2, Opt_v3, Opt_v4,
 	Opt_udp, Opt_tcp, Opt_rdma,
 	Opt_acl, Opt_noacl,
 	Opt_rdirplus, Opt_nordirplus,
@@ -96,10 +97,10 @@ enum {
 	Opt_namelen,
 	Opt_mountport,
 	Opt_mountvers,
-	Opt_nfsvers,
 	Opt_minorversion,
 
 	/* Mount options that take string arguments */
+	Opt_nfsvers,
 	Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
 	Opt_addr, Opt_mountaddr, Opt_clientaddr,
 	Opt_lookupcache,
@@ -131,9 +132,6 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_noac, "noac" },
 	{ Opt_lock, "lock" },
 	{ Opt_nolock, "nolock" },
-	{ Opt_v2, "v2" },
-	{ Opt_v3, "v3" },
-	{ Opt_v4, "v4" },
 	{ Opt_udp, "udp" },
 	{ Opt_tcp, "tcp" },
 	{ Opt_rdma, "rdma" },
@@ -162,9 +160,10 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_namelen, "namlen=%s" },
 	{ Opt_mountport, "mountport=%s" },
 	{ Opt_mountvers, "mountvers=%s" },
+	{ Opt_minorversion, "minorversion=%s" },
+
 	{ Opt_nfsvers, "nfsvers=%s" },
 	{ Opt_nfsvers, "vers=%s" },
-	{ Opt_minorversion, "minorversion=%s" },
 
 	{ Opt_sec, "sec=%s" },
 	{ Opt_proto, "proto=%s" },
@@ -178,6 +177,9 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_fscache_uniq, "fsc=%s" },
 	{ Opt_local_lock, "local_lock=%s" },
 
+	/* The following needs to be listed after all other options */
+	{ Opt_nfsvers, "v%s" },
+
 	{ Opt_err, NULL }
 };
 
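The comment above the new "v%s" entry matters because token matching returns the first pattern that fits; placed any earlier, the "v%s" catch-all would swallow strings meant for other options. A standalone first-match demonstration (plain strncmp, not the kernel's match_token()):

    #include <stdio.h>
    #include <string.h>

    static const char *patterns[] = { "nfsvers=", "vers=", "v", NULL };

    static int first_match(const char *opt)
    {
        for (int i = 0; patterns[i] != NULL; i++)
            if (strncmp(opt, patterns[i], strlen(patterns[i])) == 0)
                return i;
        return -1;
    }

    int main(void)
    {
        printf("%d\n", first_match("vers=3")); /* 1: hits "vers=" first */
        printf("%d\n", first_match("v4.1"));   /* 2: only the catch-all fits */
        return 0;
    }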
@@ -258,6 +260,22 @@ static match_table_t nfs_local_lock_tokens = {
 	{ Opt_local_lock_err, NULL }
 };
 
+enum {
+	Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0,
+	Opt_vers_4_1,
+
+	Opt_vers_err
+};
+
+static match_table_t nfs_vers_tokens = {
+	{ Opt_vers_2, "2" },
+	{ Opt_vers_3, "3" },
+	{ Opt_vers_4, "4" },
+	{ Opt_vers_4_0, "4.0" },
+	{ Opt_vers_4_1, "4.1" },
+
+	{ Opt_vers_err, NULL }
+};
 
 static void nfs_umount_begin(struct super_block *);
 static int  nfs_statfs(struct dentry *, struct kstatfs *);
@@ -619,7 +637,6 @@ static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
 	struct nfs_client *clp = nfss->nfs_client;
 
 	seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
-	seq_printf(m, ",minorversion=%u", clp->cl_minorversion);
 }
 #else
 static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
@@ -628,6 +645,15 @@ static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
 }
 #endif
 
+static void nfs_show_nfs_version(struct seq_file *m,
+		unsigned int version,
+		unsigned int minorversion)
+{
+	seq_printf(m, ",vers=%u", version);
+	if (version == 4)
+		seq_printf(m, ".%u", minorversion);
+}
+
 /*
  * Describe the mount options in force on this server representation
  */
@@ -655,7 +681,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 	u32 version = clp->rpc_ops->version;
 	int local_flock, local_fcntl;
 
-	seq_printf(m, ",vers=%u", version);
+	nfs_show_nfs_version(m, version, clp->cl_minorversion);
 	seq_printf(m, ",rsize=%u", nfss->rsize);
 	seq_printf(m, ",wsize=%u", nfss->wsize);
 	if (nfss->bsize != 0)
@@ -675,8 +701,10 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 		else
 			seq_puts(m, nfs_infop->nostr);
 	}
+	rcu_read_lock();
 	seq_printf(m, ",proto=%s",
 		   rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID));
+	rcu_read_unlock();
 	if (version == 4) {
 		if (nfss->port != NFS_PORT)
 			seq_printf(m, ",port=%u", nfss->port);
@@ -725,9 +753,11 @@ static int nfs_show_options(struct seq_file *m, struct dentry *root)
 
 	nfs_show_mount_options(m, nfss, 0);
 
+	rcu_read_lock();
 	seq_printf(m, ",addr=%s",
 		   rpc_peeraddr2str(nfss->nfs_client->cl_rpcclient,
 				    RPC_DISPLAY_ADDR));
+	rcu_read_unlock();
 
 	return 0;
 }
@@ -744,7 +774,6 @@ static void show_sessions(struct seq_file *m, struct nfs_server *server) {}
 #endif
 #endif
 
-#ifdef CONFIG_NFS_V4
 #ifdef CONFIG_NFS_V4_1
 static void show_pnfs(struct seq_file *m, struct nfs_server *server)
 {
@@ -754,9 +783,26 @@ static void show_pnfs(struct seq_file *m, struct nfs_server *server)
 	else
 		seq_printf(m, "not configured");
 }
+
+static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss)
+{
+	if (nfss->nfs_client && nfss->nfs_client->impl_id) {
+		struct nfs41_impl_id *impl_id = nfss->nfs_client->impl_id;
+		seq_printf(m, "\n\timpl_id:\tname='%s',domain='%s',"
+			   "date='%llu,%u'",
+			   impl_id->name, impl_id->domain,
+			   impl_id->date.seconds, impl_id->date.nseconds);
+	}
+}
 #else
-static void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
+#ifdef CONFIG_NFS_V4
+static void show_pnfs(struct seq_file *m, struct nfs_server *server)
+{
+}
 #endif
+static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss)
+{
+}
 #endif
 
 static int nfs_show_devname(struct seq_file *m, struct dentry *root)
@@ -805,6 +851,8 @@ static int nfs_show_stats(struct seq_file *m, struct dentry *root)
 
 	seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
 
+	show_implementation_id(m, nfss);
+
 	seq_printf(m, "\n\tcaps:\t");
 	seq_printf(m, "caps=0x%x", nfss->caps);
 	seq_printf(m, ",wtmult=%u", nfss->wtmult);
@@ -907,6 +955,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve
 		data->auth_flavor_len	= 1;
 		data->version		= version;
 		data->minorversion	= 0;
+		data->net		= current->nsproxy->net_ns;
 		security_init_mnt_opts(&data->lsm_opts);
 	}
 	return data;
@@ -1051,6 +1100,40 @@ static int nfs_parse_security_flavors(char *value,
 	return 1;
 }
 
+static int nfs_parse_version_string(char *string,
+		struct nfs_parsed_mount_data *mnt,
+		substring_t *args)
+{
+	mnt->flags &= ~NFS_MOUNT_VER3;
+	switch (match_token(string, nfs_vers_tokens, args)) {
+	case Opt_vers_2:
+		mnt->version = 2;
+		break;
+	case Opt_vers_3:
+		mnt->flags |= NFS_MOUNT_VER3;
+		mnt->version = 3;
+		break;
+	case Opt_vers_4:
+		/* Backward compatibility option. In future,
+		 * the mount program should always supply
+		 * a NFSv4 minor version number.
+		 */
+		mnt->version = 4;
+		break;
+	case Opt_vers_4_0:
+		mnt->version = 4;
+		mnt->minorversion = 0;
+		break;
+	case Opt_vers_4_1:
+		mnt->version = 4;
+		mnt->minorversion = 1;
+		break;
+	default:
+		return 0;
+	}
+	return 1;
+}
+
 static int nfs_get_option_str(substring_t args[], char **option)
 {
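nfs_parse_version_string() above maps the matched token to a (version, minorversion) pair and clears NFS_MOUNT_VER3 up front. The accepted strings and their results, in a userspace sketch of the same mapping:

    #include <stdio.h>
    #include <string.h>

    /* Same semantics as nfs_vers_tokens/nfs_parse_version_string (a sketch). */
    static int parse_version(const char *s, unsigned *ver, unsigned *minor)
    {
        *minor = 0;
        if (strcmp(s, "2") == 0) { *ver = 2; return 1; }
        if (strcmp(s, "3") == 0) { *ver = 3; return 1; }
        if (strcmp(s, "4") == 0 || strcmp(s, "4.0") == 0) { *ver = 4; return 1; }
        if (strcmp(s, "4.1") == 0) { *ver = 4; *minor = 1; return 1; }
        return 0;  /* mirrors the Opt_vers_err -> out_invalid_value path */
    }

    int main(void)
    {
        unsigned v, m;
        if (parse_version("4.1", &v, &m))
            printf("vers=%u minorversion=%u\n", v, m); /* vers=4 minorversion=1 */
        return 0;
    }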
1056 kfree(*option); 1139 kfree(*option);
@@ -1156,18 +1239,6 @@ static int nfs_parse_mount_options(char *raw,
1156 mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK | 1239 mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
1157 NFS_MOUNT_LOCAL_FCNTL); 1240 NFS_MOUNT_LOCAL_FCNTL);
1158 break; 1241 break;
1159 case Opt_v2:
1160 mnt->flags &= ~NFS_MOUNT_VER3;
1161 mnt->version = 2;
1162 break;
1163 case Opt_v3:
1164 mnt->flags |= NFS_MOUNT_VER3;
1165 mnt->version = 3;
1166 break;
1167 case Opt_v4:
1168 mnt->flags &= ~NFS_MOUNT_VER3;
1169 mnt->version = 4;
1170 break;
1171 case Opt_udp: 1242 case Opt_udp:
1172 mnt->flags &= ~NFS_MOUNT_TCP; 1243 mnt->flags &= ~NFS_MOUNT_TCP;
1173 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1244 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1294,26 +1365,6 @@ static int nfs_parse_mount_options(char *raw,
1294 goto out_invalid_value; 1365 goto out_invalid_value;
1295 mnt->mount_server.version = option; 1366 mnt->mount_server.version = option;
1296 break; 1367 break;
1297 case Opt_nfsvers:
1298 if (nfs_get_option_ul(args, &option))
1299 goto out_invalid_value;
1300 switch (option) {
1301 case NFS2_VERSION:
1302 mnt->flags &= ~NFS_MOUNT_VER3;
1303 mnt->version = 2;
1304 break;
1305 case NFS3_VERSION:
1306 mnt->flags |= NFS_MOUNT_VER3;
1307 mnt->version = 3;
1308 break;
1309 case NFS4_VERSION:
1310 mnt->flags &= ~NFS_MOUNT_VER3;
1311 mnt->version = 4;
1312 break;
1313 default:
1314 goto out_invalid_value;
1315 }
1316 break;
1317 case Opt_minorversion: 1368 case Opt_minorversion:
1318 if (nfs_get_option_ul(args, &option)) 1369 if (nfs_get_option_ul(args, &option))
1319 goto out_invalid_value; 1370 goto out_invalid_value;
@@ -1325,6 +1376,15 @@ static int nfs_parse_mount_options(char *raw,
1325 /* 1376 /*
1326 * options that take text values 1377 * options that take text values
1327 */ 1378 */
1379 case Opt_nfsvers:
1380 string = match_strdup(args);
1381 if (string == NULL)
1382 goto out_nomem;
1383 rc = nfs_parse_version_string(string, mnt, args);
1384 kfree(string);
1385 if (!rc)
1386 goto out_invalid_value;
1387 break;
1328 case Opt_sec: 1388 case Opt_sec:
1329 string = match_strdup(args); 1389 string = match_strdup(args);
1330 if (string == NULL) 1390 if (string == NULL)
@@ -1404,7 +1464,7 @@ static int nfs_parse_mount_options(char *raw,
1404 if (string == NULL) 1464 if (string == NULL)
1405 goto out_nomem; 1465 goto out_nomem;
1406 mnt->nfs_server.addrlen = 1466 mnt->nfs_server.addrlen =
1407 rpc_pton(string, strlen(string), 1467 rpc_pton(mnt->net, string, strlen(string),
1408 (struct sockaddr *) 1468 (struct sockaddr *)
1409 &mnt->nfs_server.address, 1469 &mnt->nfs_server.address,
1410 sizeof(mnt->nfs_server.address)); 1470 sizeof(mnt->nfs_server.address));
@@ -1426,7 +1486,7 @@ static int nfs_parse_mount_options(char *raw,
1426 if (string == NULL) 1486 if (string == NULL)
1427 goto out_nomem; 1487 goto out_nomem;
1428 mnt->mount_server.addrlen = 1488 mnt->mount_server.addrlen =
1429 rpc_pton(string, strlen(string), 1489 rpc_pton(mnt->net, string, strlen(string),
1430 (struct sockaddr *) 1490 (struct sockaddr *)
1431 &mnt->mount_server.address, 1491 &mnt->mount_server.address,
1432 sizeof(mnt->mount_server.address)); 1492 sizeof(mnt->mount_server.address));
@@ -1515,6 +1575,9 @@ static int nfs_parse_mount_options(char *raw,
1515 if (!sloppy && invalid_option) 1575 if (!sloppy && invalid_option)
1516 return 0; 1576 return 0;
1517 1577
1578 if (mnt->minorversion && mnt->version != 4)
1579 goto out_minorversion_mismatch;
1580
1518 /* 1581 /*
1519 * verify that any proto=/mountproto= options match the address 1582 * verify that any proto=/mountproto= options match the address
1520 * familiies in the addr=/mountaddr= options. 1583 * familiies in the addr=/mountaddr= options.
@@ -1548,6 +1611,10 @@ out_invalid_address:
1548out_invalid_value: 1611out_invalid_value:
1549 printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p); 1612 printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p);
1550 return 0; 1613 return 0;
1614out_minorversion_mismatch:
1615 printk(KERN_INFO "NFS: mount option vers=%u does not support "
1616 "minorversion=%u\n", mnt->version, mnt->minorversion);
1617 return 0;
1551out_nomem: 1618out_nomem:
1552 printk(KERN_INFO "NFS: not enough memory to parse option\n"); 1619 printk(KERN_INFO "NFS: not enough memory to parse option\n");
1553 return 0; 1620 return 0;
@@ -1621,6 +1688,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1621 .noresvport = args->flags & NFS_MOUNT_NORESVPORT, 1688 .noresvport = args->flags & NFS_MOUNT_NORESVPORT,
1622 .auth_flav_len = &server_authlist_len, 1689 .auth_flav_len = &server_authlist_len,
1623 .auth_flavs = server_authlist, 1690 .auth_flavs = server_authlist,
1691 .net = args->net,
1624 }; 1692 };
1625 int status; 1693 int status;
1626 1694
@@ -2046,7 +2114,7 @@ static inline void nfs_initialise_sb(struct super_block *sb)
2046 2114
2047 /* We probably want something more informative here */ 2115 /* We probably want something more informative here */
2048 snprintf(sb->s_id, sizeof(sb->s_id), 2116 snprintf(sb->s_id, sizeof(sb->s_id),
2049 "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev)); 2117 "%u:%u", MAJOR(sb->s_dev), MINOR(sb->s_dev));
2050 2118
2051 if (sb->s_blocksize == 0) 2119 if (sb->s_blocksize == 0)
2052 sb->s_blocksize = nfs_block_bits(server->wsize, 2120 sb->s_blocksize = nfs_block_bits(server->wsize,
@@ -2498,12 +2566,6 @@ static int nfs4_validate_text_mount_data(void *options,
2498 return -EINVAL; 2566 return -EINVAL;
2499 } 2567 }
2500 2568
2501 if (args->client_address == NULL) {
2502 dfprintk(MOUNT,
2503 "NFS4: mount program didn't pass callback address\n");
2504 return -EINVAL;
2505 }
2506
2507 return nfs_parse_devname(dev_name, 2569 return nfs_parse_devname(dev_name,
2508 &args->nfs_server.hostname, 2570 &args->nfs_server.hostname,
2509 NFS4_MAXNAMLEN, 2571 NFS4_MAXNAMLEN,
@@ -2662,8 +2724,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
2662 if (!s->s_root) { 2724 if (!s->s_root) {
2663 /* initial superblock/root creation */ 2725 /* initial superblock/root creation */
2664 nfs4_fill_super(s); 2726 nfs4_fill_super(s);
2665 nfs_fscache_get_super_cookie( 2727 nfs_fscache_get_super_cookie(s, data->fscache_uniq, NULL);
2666 s, data ? data->fscache_uniq : NULL, NULL);
2667 } 2728 }
2668 2729
2669 mntroot = nfs4_get_root(s, mntfh, dev_name); 2730 mntroot = nfs4_get_root(s, mntfh, dev_name);
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index 978aaeb8a093..ad4d2e787b20 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -32,7 +32,6 @@ static ctl_table nfs_cb_sysctls[] = {
 		.extra1 = (int *)&nfs_set_port_min,
 		.extra2 = (int *)&nfs_set_port_max,
 	},
-#ifndef CONFIG_NFS_USE_NEW_IDMAPPER
 	{
 		.procname = "idmap_cache_timeout",
 		.data = &nfs_idmap_cache_timeout,
@@ -40,7 +39,6 @@ static ctl_table nfs_cb_sysctls[] = {
 		.mode = 0644,
 		.proc_handler = proc_dointvec_jiffies,
 	},
-#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
 #endif
 	{
 		.procname = "nfs_mountpoint_timeout",
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 4f9319a2e567..3210a03342f9 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -20,15 +20,6 @@
 #include "iostat.h"
 #include "delegation.h"
 
-struct nfs_unlinkdata {
-	struct hlist_node list;
-	struct nfs_removeargs args;
-	struct nfs_removeres res;
-	struct inode *dir;
-	struct rpc_cred *cred;
-	struct nfs_fattr dir_attr;
-};
-
 /**
  * nfs_free_unlinkdata - release data from a sillydelete operation.
  * @data: pointer to unlink structure.
@@ -107,25 +98,16 @@ static void nfs_async_unlink_release(void *calldata)
 	nfs_sb_deactive(sb);
 }
 
-#if defined(CONFIG_NFS_V4_1)
-void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
+static void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_unlinkdata *data = calldata;
-	struct nfs_server *server = NFS_SERVER(data->dir);
-
-	if (nfs4_setup_sequence(server, &data->args.seq_args,
-				&data->res.seq_res, 1, task))
-		return;
-	rpc_call_start(task);
+	NFS_PROTO(data->dir)->unlink_rpc_prepare(task, data);
 }
-#endif /* CONFIG_NFS_V4_1 */
 
 static const struct rpc_call_ops nfs_unlink_ops = {
 	.rpc_call_done = nfs_async_unlink_done,
 	.rpc_release = nfs_async_unlink_release,
-#if defined(CONFIG_NFS_V4_1)
 	.rpc_call_prepare = nfs_unlink_prepare,
-#endif /* CONFIG_NFS_V4_1 */
 };
 
 static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data)
@@ -341,18 +323,6 @@ nfs_cancel_async_unlink(struct dentry *dentry)
 	spin_unlock(&dentry->d_lock);
 }
 
-struct nfs_renamedata {
-	struct nfs_renameargs args;
-	struct nfs_renameres res;
-	struct rpc_cred *cred;
-	struct inode *old_dir;
-	struct dentry *old_dentry;
-	struct nfs_fattr old_fattr;
-	struct inode *new_dir;
-	struct dentry *new_dentry;
-	struct nfs_fattr new_fattr;
-};
-
 /**
  * nfs_async_rename_done - Sillyrename post-processing
  * @task: rpc_task of the sillyrename
@@ -403,25 +373,16 @@ static void nfs_async_rename_release(void *calldata)
 	kfree(data);
 }
 
-#if defined(CONFIG_NFS_V4_1)
 static void nfs_rename_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_renamedata *data = calldata;
-	struct nfs_server *server = NFS_SERVER(data->old_dir);
-
-	if (nfs4_setup_sequence(server, &data->args.seq_args,
-				&data->res.seq_res, 1, task))
-		return;
-	rpc_call_start(task);
+	NFS_PROTO(data->old_dir)->rename_rpc_prepare(task, data);
 }
-#endif /* CONFIG_NFS_V4_1 */
 
 static const struct rpc_call_ops nfs_rename_ops = {
 	.rpc_call_done = nfs_async_rename_done,
 	.rpc_release = nfs_async_rename_release,
-#if defined(CONFIG_NFS_V4_1)
 	.rpc_call_prepare = nfs_rename_prepare,
-#endif /* CONFIG_NFS_V4_1 */
 };
 
 /**
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 834f0fe96f89..2c68818f68ac 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -100,7 +100,6 @@ void nfs_writedata_free(struct nfs_write_data *p)
 
 void nfs_writedata_release(struct nfs_write_data *wdata)
 {
-	put_lseg(wdata->lseg);
 	put_nfs_open_context(wdata->args.context);
 	nfs_writedata_free(wdata);
 }
@@ -236,10 +235,10 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblo
 		req = nfs_page_find_request_locked(page);
 		if (req == NULL)
 			break;
-		if (nfs_set_page_tag_locked(req))
+		if (nfs_lock_request_dontget(req))
 			break;
 		/* Note: If we hold the page lock, as is the case in nfs_writepage,
-		 *	 then the call to nfs_set_page_tag_locked() will always
+		 *	 then the call to nfs_lock_request_dontget() will always
 		 *	 succeed provided that someone hasn't already marked the
 		 *	 request as dirty (in which case we don't care).
 		 */
@@ -375,21 +374,14 @@ out_err:
 /*
  * Insert a write request into an inode
  */
-static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
+static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	int error;
-
-	error = radix_tree_preload(GFP_NOFS);
-	if (error != 0)
-		goto out;
 
 	/* Lock the request! */
 	nfs_lock_request_dontget(req);
 
 	spin_lock(&inode->i_lock);
-	error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
-	BUG_ON(error);
 	if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE))
 		inode->i_version++;
 	set_bit(PG_MAPPED, &req->wb_flags);
@@ -397,12 +389,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 	set_page_private(req->wb_page, (unsigned long)req);
 	nfsi->npages++;
 	kref_get(&req->wb_kref);
-	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
-			NFS_PAGE_TAG_LOCKED);
 	spin_unlock(&inode->i_lock);
-	radix_tree_preload_end();
-out:
-	return error;
 }
 
 /*
@@ -419,7 +406,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 	set_page_private(req->wb_page, 0);
 	ClearPagePrivate(req->wb_page);
 	clear_bit(PG_MAPPED, &req->wb_flags);
-	radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
 	nfsi->npages--;
 	spin_unlock(&inode->i_lock);
 	nfs_release_request(req);
@@ -432,39 +418,90 @@ nfs_mark_request_dirty(struct nfs_page *req)
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-/*
- * Add a request to the inode's commit list.
+/**
+ * nfs_request_add_commit_list - add request to a commit list
+ * @req: pointer to a struct nfs_page
+ * @head: commit list head
+ *
+ * This sets the PG_CLEAN bit, updates the inode global count of
+ * number of outstanding requests requiring a commit as well as
+ * the MM page stats.
+ *
+ * The caller must _not_ hold the inode->i_lock, but must be
+ * holding the nfs_page lock.
  */
-static void
-nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
+void
+nfs_request_add_commit_list(struct nfs_page *req, struct list_head *head)
 {
 	struct inode *inode = req->wb_context->dentry->d_inode;
-	struct nfs_inode *nfsi = NFS_I(inode);
 
-	spin_lock(&inode->i_lock);
 	set_bit(PG_CLEAN, &(req)->wb_flags);
-	radix_tree_tag_set(&nfsi->nfs_page_tree,
-			req->wb_index,
-			NFS_PAGE_TAG_COMMIT);
-	nfsi->ncommit++;
+	spin_lock(&inode->i_lock);
+	nfs_list_add_request(req, head);
+	NFS_I(inode)->ncommit++;
 	spin_unlock(&inode->i_lock);
-	pnfs_mark_request_commit(req, lseg);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 }
+EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
 
-static int
+/**
+ * nfs_request_remove_commit_list - Remove request from a commit list
+ * @req: pointer to a nfs_page
+ *
+ * This clears the PG_CLEAN bit, and updates the inode global count of
+ * number of outstanding requests requiring a commit
+ * It does not update the MM page stats.
+ *
+ * The caller _must_ hold the inode->i_lock and the nfs_page lock.
+ */
+void
+nfs_request_remove_commit_list(struct nfs_page *req)
+{
+	struct inode *inode = req->wb_context->dentry->d_inode;
+
+	if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags))
+		return;
+	nfs_list_remove_request(req);
+	NFS_I(inode)->ncommit--;
+}
+EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list);
+
+
+/*
+ * Add a request to the inode's commit list.
+ */
+static void
+nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
+{
+	struct inode *inode = req->wb_context->dentry->d_inode;
+
+	if (pnfs_mark_request_commit(req, lseg))
+		return;
+	nfs_request_add_commit_list(req, &NFS_I(inode)->commit_list);
+}
+
+static void
+nfs_clear_page_commit(struct page *page)
+{
+	dec_zone_page_state(page, NR_UNSTABLE_NFS);
+	dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+}
+
+static void
 nfs_clear_request_commit(struct nfs_page *req)
 {
-	struct page *page = req->wb_page;
+	if (test_bit(PG_CLEAN, &req->wb_flags)) {
+		struct inode *inode = req->wb_context->dentry->d_inode;
 
-	if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
-		dec_zone_page_state(page, NR_UNSTABLE_NFS);
-		dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
-		return 1;
+		if (!pnfs_clear_request_commit(req)) {
+			spin_lock(&inode->i_lock);
+			nfs_request_remove_commit_list(req);
+			spin_unlock(&inode->i_lock);
+		}
+		nfs_clear_page_commit(req->wb_page);
 	}
-	return 0;
 }
 
 static inline
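The kerneldoc comments added above spell out an asymmetric locking contract: nfs_request_add_commit_list() takes inode->i_lock itself, while nfs_request_remove_commit_list() must be entered with it held (plus the nfs_page lock). That asymmetry is what lets a scan pop many requests under a single lock acquisition, roughly like this sketch of a caller (illustrative only; the real callers are nfs_clear_request_commit() above and nfs_scan_commit_list() below):

    spin_lock(&inode->i_lock);
    list_for_each_entry_safe(req, tmp, &NFS_I(inode)->commit_list, wb_list) {
        if (!nfs_lock_request(req))        /* contract also wants the page lock */
            continue;
        nfs_request_remove_commit_list(req); /* no per-request relock needed */
    }
    spin_unlock(&inode->i_lock);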
@@ -491,15 +528,14 @@ int nfs_reschedule_unstable_write(struct nfs_page *req,
 	return 0;
 }
 #else
-static inline void
+static void
 nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
 {
 }
 
-static inline int
+static void
 nfs_clear_request_commit(struct nfs_page *req)
 {
-	return 0;
 }
 
 static inline
@@ -520,46 +556,65 @@ int nfs_reschedule_unstable_write(struct nfs_page *req,
 static int
 nfs_need_commit(struct nfs_inode *nfsi)
 {
-	return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT);
+	return nfsi->ncommit > 0;
+}
+
+/* i_lock held by caller */
+static int
+nfs_scan_commit_list(struct list_head *src, struct list_head *dst, int max,
+		spinlock_t *lock)
+{
+	struct nfs_page *req, *tmp;
+	int ret = 0;
+
+	list_for_each_entry_safe(req, tmp, src, wb_list) {
+		if (!nfs_lock_request(req))
+			continue;
+		if (cond_resched_lock(lock))
+			list_safe_reset_next(req, tmp, wb_list);
+		nfs_request_remove_commit_list(req);
+		nfs_list_add_request(req, dst);
+		ret++;
+		if (ret == max)
+			break;
+	}
+	return ret;
 }
 
 /*
  * nfs_scan_commit - Scan an inode for commit requests
  * @inode: NFS inode to scan
  * @dst: destination list
- * @idx_start: lower bound of page->index to scan.
- * @npages: idx_start + npages sets the upper bound to scan.
  *
  * Moves requests from the inode's 'commit' request list.
  * The requests are *not* checked to ensure that they form a contiguous set.
 */
 static int
-nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
+nfs_scan_commit(struct inode *inode, struct list_head *dst)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	int ret;
-
-	if (!nfs_need_commit(nfsi))
-		return 0;
+	int ret = 0;
 
 	spin_lock(&inode->i_lock);
-	ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
-	if (ret > 0)
-		nfsi->ncommit -= ret;
-	spin_unlock(&inode->i_lock);
-
-	if (nfs_need_commit(NFS_I(inode)))
-		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	if (nfsi->ncommit > 0) {
+		const int max = INT_MAX;
 
+		ret = nfs_scan_commit_list(&nfsi->commit_list, dst, max,
+				&inode->i_lock);
+		ret += pnfs_scan_commit_lists(inode, max - ret,
+				&inode->i_lock);
+	}
+	spin_unlock(&inode->i_lock);
 	return ret;
 }
+
 #else
 static inline int nfs_need_commit(struct nfs_inode *nfsi)
 {
 	return 0;
 }
 
-static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
+static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst)
 {
 	return 0;
 }
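nfs_scan_commit_list() above walks the commit list entirely under inode->i_lock yet stays preemption-friendly: cond_resched_lock() may drop and retake the lock at a reschedule point, and when it reports that it did, the cached next pointer can be stale, so list_safe_reset_next() re-reads it before the entry is moved. The core of the loop again, annotated:

    list_for_each_entry_safe(req, tmp, src, wb_list) {
        if (!nfs_lock_request(req))
            continue;                   /* skip requests someone else holds */
        if (cond_resched_lock(lock))    /* lock was dropped and retaken */
            list_safe_reset_next(req, tmp, wb_list); /* refresh stale 'tmp' */
        nfs_request_remove_commit_list(req);         /* i_lock is held here */
        nfs_list_add_request(req, dst);
        ret++;
        if (ret == max)
            break;
    }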
@@ -604,7 +659,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 		    || end < req->wb_offset)
 			goto out_flushme;
 
-		if (nfs_set_page_tag_locked(req))
+		if (nfs_lock_request_dontget(req))
 			break;
 
 		/* The request is locked, so wait and then retry */
@@ -616,13 +671,6 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 		spin_lock(&inode->i_lock);
 	}
 
-	if (nfs_clear_request_commit(req) &&
-	    radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
-				 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) {
-		NFS_I(inode)->ncommit--;
-		pnfs_clear_request_commit(req);
-	}
-
 	/* Okay, the request matches. Update the region */
 	if (offset < req->wb_offset) {
 		req->wb_offset = offset;
@@ -634,6 +682,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 		req->wb_bytes = rqend - req->wb_offset;
 out_unlock:
 	spin_unlock(&inode->i_lock);
+	nfs_clear_request_commit(req);
 	return req;
 out_flushme:
 	spin_unlock(&inode->i_lock);
@@ -655,7 +704,6 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
 {
 	struct inode *inode = page->mapping->host;
 	struct nfs_page *req;
-	int error;
 
 	req = nfs_try_to_update_request(inode, page, offset, bytes);
 	if (req != NULL)
@@ -663,11 +711,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
 	req = nfs_create_request(ctx, inode, page, offset, bytes);
 	if (IS_ERR(req))
 		goto out;
-	error = nfs_inode_add_request(inode, req);
-	if (error != 0) {
-		nfs_release_request(req);
-		req = ERR_PTR(error);
-	}
+	nfs_inode_add_request(inode, req);
 out:
 	return req;
 }
@@ -684,7 +728,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
 	nfs_grow_file(page, offset, count);
 	nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
 	nfs_mark_request_dirty(req);
-	nfs_clear_page_tag_locked(req);
+	nfs_unlock_request(req);
 	return 0;
 }
 
@@ -777,7 +821,7 @@ static void nfs_writepage_release(struct nfs_page *req,
 
 	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data))
 		nfs_inode_remove_request(req);
-	nfs_clear_page_tag_locked(req);
+	nfs_unlock_request(req);
 	nfs_end_page_writeback(page);
 }
 
@@ -925,7 +969,7 @@ static void nfs_redirty_request(struct nfs_page *req)
 	struct page *page = req->wb_page;
 
 	nfs_mark_request_dirty(req);
-	nfs_clear_page_tag_locked(req);
+	nfs_unlock_request(req);
 	nfs_end_page_writeback(page);
 }
 
@@ -1128,23 +1172,14 @@ out:
 	nfs_writedata_release(calldata);
 }
 
-#if defined(CONFIG_NFS_V4_1)
 void nfs_write_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data *data = calldata;
-
-	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
-				&data->args.seq_args,
-				&data->res.seq_res, 1, task))
-		return;
-	rpc_call_start(task);
+	NFS_PROTO(data->inode)->write_rpc_prepare(task, data);
 }
-#endif /* CONFIG_NFS_V4_1 */
 
 static const struct rpc_call_ops nfs_write_partial_ops = {
-#if defined(CONFIG_NFS_V4_1)
 	.rpc_call_prepare = nfs_write_prepare,
-#endif /* CONFIG_NFS_V4_1 */
 	.rpc_call_done = nfs_writeback_done_partial,
 	.rpc_release = nfs_writeback_release_partial,
 };
@@ -1199,16 +1234,14 @@ static void nfs_writeback_release_full(void *calldata)
 remove_request:
 		nfs_inode_remove_request(req);
 	next:
-		nfs_clear_page_tag_locked(req);
+		nfs_unlock_request(req);
 		nfs_end_page_writeback(page);
 	}
 	nfs_writedata_release(calldata);
 }
 
 static const struct rpc_call_ops nfs_write_full_ops = {
-#if defined(CONFIG_NFS_V4_1)
 	.rpc_call_prepare = nfs_write_prepare,
-#endif /* CONFIG_NFS_V4_1 */
 	.rpc_call_done = nfs_writeback_done_full,
 	.rpc_release = nfs_writeback_release_full,
 };
@@ -1325,7 +1358,6 @@ void nfs_commitdata_release(void *data)
 {
 	struct nfs_write_data *wdata = data;
 
-	put_lseg(wdata->lseg);
 	put_nfs_open_context(wdata->args.context);
 	nfs_commit_free(wdata);
 }
@@ -1411,7 +1443,7 @@ void nfs_retry_commit(struct list_head *page_list,
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
 			     BDI_RECLAIMABLE);
-		nfs_clear_page_tag_locked(req);
+		nfs_unlock_request(req);
 	}
 }
 EXPORT_SYMBOL_GPL(nfs_retry_commit);
@@ -1460,7 +1492,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data)
 	while (!list_empty(&data->pages)) {
 		req = nfs_list_entry(data->pages.next);
 		nfs_list_remove_request(req);
-		nfs_clear_request_commit(req);
+		nfs_clear_page_commit(req->wb_page);
 
 		dprintk("NFS: commit (%s/%lld %d@%lld)",
 			req->wb_context->dentry->d_sb->s_id,
@@ -1486,7 +1518,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data)
 		dprintk(" mismatch\n");
 		nfs_mark_request_dirty(req);
 	next:
-		nfs_clear_page_tag_locked(req);
+		nfs_unlock_request(req);
 	}
 }
 EXPORT_SYMBOL_GPL(nfs_commit_release_pages);
@@ -1501,9 +1533,7 @@ static void nfs_commit_release(void *calldata)
 }
 
 static const struct rpc_call_ops nfs_commit_ops = {
-#if defined(CONFIG_NFS_V4_1)
 	.rpc_call_prepare = nfs_write_prepare,
-#endif /* CONFIG_NFS_V4_1 */
 	.rpc_call_done = nfs_commit_done,
 	.rpc_release = nfs_commit_release,
 };
@@ -1517,7 +1547,7 @@ int nfs_commit_inode(struct inode *inode, int how)
 	res = nfs_commit_set_lock(NFS_I(inode), may_wait);
 	if (res <= 0)
 		goto out_mark_dirty;
-	res = nfs_scan_commit(inode, &head, 0, 0);
+	res = nfs_scan_commit(inode, &head);
 	if (res) {
1522 int error; 1552 int error;
1523 1553
@@ -1635,6 +1665,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1635 if (req == NULL) 1665 if (req == NULL)
1636 break; 1666 break;
1637 if (nfs_lock_request_dontget(req)) { 1667 if (nfs_lock_request_dontget(req)) {
1668 nfs_clear_request_commit(req);
1638 nfs_inode_remove_request(req); 1669 nfs_inode_remove_request(req);
1639 /* 1670 /*
1640 * In case nfs_inode_remove_request has marked the 1671 * In case nfs_inode_remove_request has marked the
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 6f3ebb48b12f..0e262f32ac41 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -605,24 +605,24 @@ static struct rpc_version nfs_cb_version4 = {
605 .procs = nfs4_cb_procedures 605 .procs = nfs4_cb_procedures
606}; 606};
607 607
608static struct rpc_version *nfs_cb_version[] = { 608static const struct rpc_version *nfs_cb_version[] = {
609 &nfs_cb_version4, 609 &nfs_cb_version4,
610}; 610};
611 611
612static struct rpc_program cb_program; 612static const struct rpc_program cb_program;
613 613
614static struct rpc_stat cb_stats = { 614static struct rpc_stat cb_stats = {
615 .program = &cb_program 615 .program = &cb_program
616}; 616};
617 617
618#define NFS4_CALLBACK 0x40000000 618#define NFS4_CALLBACK 0x40000000
619static struct rpc_program cb_program = { 619static const struct rpc_program cb_program = {
620 .name = "nfs4_cb", 620 .name = "nfs4_cb",
621 .number = NFS4_CALLBACK, 621 .number = NFS4_CALLBACK,
622 .nrvers = ARRAY_SIZE(nfs_cb_version), 622 .nrvers = ARRAY_SIZE(nfs_cb_version),
623 .version = nfs_cb_version, 623 .version = nfs_cb_version,
624 .stats = &cb_stats, 624 .stats = &cb_stats,
625 .pipe_dir_name = "/nfsd4_cb", 625 .pipe_dir_name = "nfsd4_cb",
626}; 626};
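
Beyond the constification, note the idiom that makes this hunk compile: cb_stats points at cb_program and cb_program points back at cb_stats, so the earlier "static const struct rpc_program cb_program;" is a tentative definition that breaks the cycle. A compilable model of the same trick (all names invented; gcc accepts a tentative const definition completed later, though some compilers warn about it):

#include <stdio.h>

struct program;

struct stat_block { const struct program *program; };
struct program    { const char *name; struct stat_block *stats; };

/* Tentative definition: lets cb_stats take &cb_program before the
 * full initializer, which points back at cb_stats, has been seen. */
static const struct program cb_program;

static struct stat_block cb_stats = { .program = &cb_program };

static const struct program cb_program = {
	.name  = "nfs4_cb",
	.stats = &cb_stats,
};

int main(void)
{
	printf("%s\n", cb_stats.program->name);
	return 0;
}
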
627 627
628static int max_cb_time(void) 628static int max_cb_time(void)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e8c98f009670..c5cddd659429 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1308,7 +1308,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
1308 else 1308 else
1309 goto out_err; 1309 goto out_err;
1310 1310
1311 conn->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val, 1311 conn->cb_addrlen = rpc_uaddr2sockaddr(&init_net, se->se_callback_addr_val,
1312 se->se_callback_addr_len, 1312 se->se_callback_addr_len,
1313 (struct sockaddr *)&conn->cb_addr, 1313 (struct sockaddr *)&conn->cb_addr,
1314 sizeof(conn->cb_addr)); 1314 sizeof(conn->cb_addr));
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 748eda93ce59..64c24af8d7ea 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -223,7 +223,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
223 if (qword_get(&buf, fo_path, size) < 0) 223 if (qword_get(&buf, fo_path, size) < 0)
224 return -EINVAL; 224 return -EINVAL;
225 225
226 if (rpc_pton(fo_path, size, sap, salen) == 0) 226 if (rpc_pton(&init_net, fo_path, size, sap, salen) == 0)
227 return -EINVAL; 227 return -EINVAL;
228 228
229 return nlmsvc_unlock_all_by_ip(sap); 229 return nlmsvc_unlock_all_by_ip(sap);
@@ -722,7 +722,7 @@ static ssize_t __write_ports_addxprt(char *buf)
722 nfsd_serv->sv_nrthreads--; 722 nfsd_serv->sv_nrthreads--;
723 return 0; 723 return 0;
724out_close: 724out_close:
725 xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port); 725 xprt = svc_find_xprt(nfsd_serv, transport, &init_net, PF_INET, port);
726 if (xprt != NULL) { 726 if (xprt != NULL) {
727 svc_close_xprt(xprt); 727 svc_close_xprt(xprt);
728 svc_xprt_put(xprt); 728 svc_xprt_put(xprt);
@@ -748,7 +748,7 @@ static ssize_t __write_ports_delxprt(char *buf)
748 if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL) 748 if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
749 return -EINVAL; 749 return -EINVAL;
750 750
751 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); 751 xprt = svc_find_xprt(nfsd_serv, transport, &init_net, AF_UNSPEC, port);
752 if (xprt == NULL) 752 if (xprt == NULL)
753 return -ENOTCONN; 753 return -ENOTCONN;
754 754
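
The &init_net arguments in these nfsctl.c hunks are the visible end of making sunrpc address parsing and transport lookup network-namespace aware: rpc_pton() and svc_find_xprt() now take a struct net, and callers with no container context pin the boot-time namespace. A toy model of that conversion pattern (none of this is the real sunrpc API):

#include <stdio.h>

/* Toy stand-in for struct net; the kernel passes &init_net here. */
struct net { const char *name; };

static struct net init_net = { "init_net" };

/* Before the conversion this lookup was implicitly global; now every
 * caller must say which namespace it means. */
static int xprt_find(struct net *net, const char *transport, int port)
{
	printf("lookup %s:%d in %s\n", transport, port, net->name);
	return 0;
}

int main(void)
{
	/* Callers that predate containers, like write_unlock_ip() and
	 * __write_ports_addxprt() above, simply pin init_net. */
	return xprt_find(&init_net, "tcp", 2049);
}
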
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index eda7d7e55e05..fce472f5f39e 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -251,13 +251,13 @@ static void nfsd_shutdown(void)
251 nfsd_up = false; 251 nfsd_up = false;
252} 252}
253 253
254static void nfsd_last_thread(struct svc_serv *serv) 254static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
255{ 255{
256 /* When last nfsd thread exits we need to do some clean-up */ 256 /* When last nfsd thread exits we need to do some clean-up */
257 nfsd_serv = NULL; 257 nfsd_serv = NULL;
258 nfsd_shutdown(); 258 nfsd_shutdown();
259 259
260 svc_rpcb_cleanup(serv); 260 svc_rpcb_cleanup(serv, net);
261 261
262 printk(KERN_WARNING "nfsd: last server has exited, flushing export " 262 printk(KERN_WARNING "nfsd: last server has exited, flushing export "
263 "cache\n"); 263 "cache\n");
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index a2e2402b2afb..6d4521feb6e3 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -25,6 +25,7 @@
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/sunrpc/stats.h> 26#include <linux/sunrpc/stats.h>
27#include <linux/nfsd/stats.h> 27#include <linux/nfsd/stats.h>
28#include <net/net_namespace.h>
28 29
29#include "nfsd.h" 30#include "nfsd.h"
30 31
@@ -94,11 +95,11 @@ static const struct file_operations nfsd_proc_fops = {
94void 95void
95nfsd_stat_init(void) 96nfsd_stat_init(void)
96{ 97{
97 svc_proc_register(&nfsd_svcstats, &nfsd_proc_fops); 98 svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_fops);
98} 99}
99 100
100void 101void
101nfsd_stat_shutdown(void) 102nfsd_stat_shutdown(void)
102{ 103{
103 svc_proc_unregister("nfsd"); 104 svc_proc_unregister(&init_net, "nfsd");
104} 105}
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index ee188158a224..c887b1378f7e 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -447,7 +447,7 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
447 return event; 447 return event;
448} 448}
449 449
450__init int fsnotify_notification_init(void) 450static __init int fsnotify_notification_init(void)
451{ 451{
452 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); 452 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
453 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); 453 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
@@ -461,4 +461,3 @@ __init int fsnotify_notification_init(void)
461 return 0; 461 return 0;
462} 462}
463subsys_initcall(fsnotify_notification_init); 463subsys_initcall(fsnotify_notification_init);
464
diff --git a/fs/pipe.c b/fs/pipe.c
index fe0502f9beb2..25feaa3faac0 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -13,6 +13,7 @@
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/log2.h> 14#include <linux/log2.h>
15#include <linux/mount.h> 15#include <linux/mount.h>
16#include <linux/magic.h>
16#include <linux/pipe_fs_i.h> 17#include <linux/pipe_fs_i.h>
17#include <linux/uio.h> 18#include <linux/uio.h>
18#include <linux/highmem.h> 19#include <linux/highmem.h>
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index cea4623f1ed6..5e325a42e33d 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -18,7 +18,7 @@
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/posix_acl.h> 20#include <linux/posix_acl.h>
21#include <linux/module.h> 21#include <linux/export.h>
22 22
23#include <linux/errno.h> 23#include <linux/errno.h>
24 24
diff --git a/fs/proc/array.c b/fs/proc/array.c
index c602b8d20f06..fbb53c249086 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -462,59 +462,56 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
462 /* convert nsec -> ticks */ 462 /* convert nsec -> ticks */
463 start_time = nsec_to_clock_t(start_time); 463 start_time = nsec_to_clock_t(start_time);
464 464
465 seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ 465 seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
466%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ 466 seq_put_decimal_ll(m, ' ', ppid);
467%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n", 467 seq_put_decimal_ll(m, ' ', pgid);
468 pid_nr_ns(pid, ns), 468 seq_put_decimal_ll(m, ' ', sid);
469 tcomm, 469 seq_put_decimal_ll(m, ' ', tty_nr);
470 state, 470 seq_put_decimal_ll(m, ' ', tty_pgrp);
471 ppid, 471 seq_put_decimal_ull(m, ' ', task->flags);
472 pgid, 472 seq_put_decimal_ull(m, ' ', min_flt);
473 sid, 473 seq_put_decimal_ull(m, ' ', cmin_flt);
474 tty_nr, 474 seq_put_decimal_ull(m, ' ', maj_flt);
475 tty_pgrp, 475 seq_put_decimal_ull(m, ' ', cmaj_flt);
476 task->flags, 476 seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime));
477 min_flt, 477 seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime));
478 cmin_flt, 478 seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime));
479 maj_flt, 479 seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime));
480 cmaj_flt, 480 seq_put_decimal_ll(m, ' ', priority);
481 cputime_to_clock_t(utime), 481 seq_put_decimal_ll(m, ' ', nice);
482 cputime_to_clock_t(stime), 482 seq_put_decimal_ll(m, ' ', num_threads);
483 cputime_to_clock_t(cutime), 483 seq_put_decimal_ull(m, ' ', 0);
484 cputime_to_clock_t(cstime), 484 seq_put_decimal_ull(m, ' ', start_time);
485 priority, 485 seq_put_decimal_ull(m, ' ', vsize);
486 nice, 486 seq_put_decimal_ll(m, ' ', mm ? get_mm_rss(mm) : 0);
487 num_threads, 487 seq_put_decimal_ull(m, ' ', rsslim);
488 start_time, 488 seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0);
489 vsize, 489 seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0);
490 mm ? get_mm_rss(mm) : 0, 490 seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0);
491 rsslim, 491 seq_put_decimal_ull(m, ' ', esp);
492 mm ? (permitted ? mm->start_code : 1) : 0, 492 seq_put_decimal_ull(m, ' ', eip);
493 mm ? (permitted ? mm->end_code : 1) : 0, 493 /* The signal information here is obsolete.
494 (permitted && mm) ? mm->start_stack : 0, 494 * It must be decimal for Linux 2.0 compatibility.
495 esp, 495 * Use /proc/#/status for real-time signals.
496 eip, 496 */
497 /* The signal information here is obsolete. 497 seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL);
498 * It must be decimal for Linux 2.0 compatibility. 498 seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
499 * Use /proc/#/status for real-time signals. 499 seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
500 */ 500 seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
501 task->pending.signal.sig[0] & 0x7fffffffUL, 501 seq_put_decimal_ull(m, ' ', wchan);
502 task->blocked.sig[0] & 0x7fffffffUL, 502 seq_put_decimal_ull(m, ' ', 0);
503 sigign .sig[0] & 0x7fffffffUL, 503 seq_put_decimal_ull(m, ' ', 0);
504 sigcatch .sig[0] & 0x7fffffffUL, 504 seq_put_decimal_ll(m, ' ', task->exit_signal);
505 wchan, 505 seq_put_decimal_ll(m, ' ', task_cpu(task));
506 0UL, 506 seq_put_decimal_ull(m, ' ', task->rt_priority);
507 0UL, 507 seq_put_decimal_ull(m, ' ', task->policy);
508 task->exit_signal, 508 seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task));
509 task_cpu(task), 509 seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime));
510 task->rt_priority, 510 seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime));
511 task->policy, 511 seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_data : 0);
512 (unsigned long long)delayacct_blkio_ticks(task), 512 seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->end_data : 0);
513 cputime_to_clock_t(gtime), 513 seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_brk : 0);
514 cputime_to_clock_t(cgtime), 514 seq_putc(m, '\n');
515 (mm && permitted) ? mm->start_data : 0,
516 (mm && permitted) ? mm->end_data : 0,
517 (mm && permitted) ? mm->start_brk : 0);
518 if (mm) 515 if (mm)
519 mmput(mm); 516 mmput(mm);
520 return 0; 517 return 0;
@@ -542,8 +539,20 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
542 size = task_statm(mm, &shared, &text, &data, &resident); 539 size = task_statm(mm, &shared, &text, &data, &resident);
543 mmput(mm); 540 mmput(mm);
544 } 541 }
545 seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", 542 /*
546 size, resident, shared, text, data); 543 * For quick read, open code by putting numbers directly
544 * expected format is
545 * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
546 * size, resident, shared, text, data);
547 */
548 seq_put_decimal_ull(m, 0, size);
549 seq_put_decimal_ull(m, ' ', resident);
550 seq_put_decimal_ull(m, ' ', shared);
551 seq_put_decimal_ull(m, ' ', text);
552 seq_put_decimal_ull(m, ' ', 0);
553 seq_put_decimal_ull(m, ' ', data);
554 seq_put_decimal_ull(m, ' ', 0);
555 seq_putc(m, '\n');
547 556
548 return 0; 557 return 0;
549} 558}
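
Both conversions in fs/proc/array.c exist because vsnprintf format parsing dominates on these hot /proc paths; seq_put_decimal_ull() emits the separator and the digits directly into the seq_file buffer. A stand-alone approximation of the helper (the real one also bounds-checks the buffer and handles overflow):

#include <stdio.h>

/* Minimal model of seq_put_decimal_ull(): append an optional
 * separator and a decimal number without a format string. */
static size_t put_decimal_ull(char *buf, char sep, unsigned long long num)
{
	char tmp[21];	/* enough digits for 2^64 - 1 */
	int i = 0;
	size_t n = 0;

	if (sep)
		buf[n++] = sep;
	do {
		tmp[i++] = '0' + num % 10;
		num /= 10;
	} while (num);
	while (i)
		buf[n++] = tmp[--i];	/* digits were built backwards */
	return n;
}

int main(void)
{
	char line[64];
	size_t n = 0;

	n += put_decimal_ull(line + n, 0, 1234);
	n += put_decimal_ull(line + n, ' ', 567);
	line[n] = '\0';
	printf("%s\n", line);	/* prints "1234 567" */
	return 0;
}
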
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index c44efe19798f..5f79bb8b4c60 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -10,12 +10,15 @@
10 */ 10 */
11 11
12#include <linux/proc_fs.h> 12#include <linux/proc_fs.h>
13struct ctl_table_header;
13 14
14extern struct proc_dir_entry proc_root; 15extern struct proc_dir_entry proc_root;
15#ifdef CONFIG_PROC_SYSCTL 16#ifdef CONFIG_PROC_SYSCTL
16extern int proc_sys_init(void); 17extern int proc_sys_init(void);
18extern void sysctl_head_put(struct ctl_table_header *head);
17#else 19#else
18static inline void proc_sys_init(void) { } 20static inline void proc_sys_init(void) { }
21static inline void sysctl_head_put(struct ctl_table_header *head) { }
19#endif 22#endif
20#ifdef CONFIG_NET 23#ifdef CONFIG_NET
21extern int proc_net_init(void); 24extern int proc_net_init(void);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e5e69aff6c69..86c67eee439f 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -157,7 +157,8 @@ static int kcore_update_ram(void)
157 157
158#ifdef CONFIG_SPARSEMEM_VMEMMAP 158#ifdef CONFIG_SPARSEMEM_VMEMMAP
159/* calculate vmemmap's address from given system ram pfn and register it */ 159/* calculate vmemmap's address from given system ram pfn and register it */
160int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) 160static int
161get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
161{ 162{
162 unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT; 163 unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT;
163 unsigned long nr_pages = ent->size >> PAGE_SHIFT; 164 unsigned long nr_pages = ent->size >> PAGE_SHIFT;
@@ -189,7 +190,8 @@ int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
189 190
190} 191}
191#else 192#else
192int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) 193static int
194get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
193{ 195{
194 return 1; 196 return 1;
195} 197}
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 27da860115c6..3551f1f839eb 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -53,7 +53,7 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
53 ei->ns_ops = ns_ops; 53 ei->ns_ops = ns_ops;
54 ei->ns = ns; 54 ei->ns = ns;
55 55
56 dentry->d_op = &pid_dentry_operations; 56 d_set_d_op(dentry, &pid_dentry_operations);
57 d_add(dentry, inode); 57 d_add(dentry, inode);
58 /* Close the race of the process dying before we return the dentry */ 58 /* Close the race of the process dying before we return the dentry */
59 if (pid_revalidate(dentry, NULL)) 59 if (pid_revalidate(dentry, NULL))
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 67bbf6e4e197..21d836f40292 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -9,6 +9,7 @@
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/namei.h> 10#include <linux/namei.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/module.h>
12#include "internal.h" 13#include "internal.h"
13 14
14static const struct dentry_operations proc_sys_dentry_operations; 15static const struct dentry_operations proc_sys_dentry_operations;
@@ -26,6 +27,371 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll)
26 wake_up_interruptible(&poll->wait); 27 wake_up_interruptible(&poll->wait);
27} 28}
28 29
30static struct ctl_table root_table[] = {
31 {
32 .procname = "",
33 .mode = S_IFDIR|S_IRUGO|S_IXUGO,
34 },
35 { }
36};
37static struct ctl_table_root sysctl_table_root = {
38 .default_set.dir.header = {
39 {{.count = 1,
40 .nreg = 1,
41 .ctl_table = root_table }},
42 .ctl_table_arg = root_table,
43 .root = &sysctl_table_root,
44 .set = &sysctl_table_root.default_set,
45 },
46};
47
48static DEFINE_SPINLOCK(sysctl_lock);
49
50static void drop_sysctl_table(struct ctl_table_header *header);
51static int sysctl_follow_link(struct ctl_table_header **phead,
52 struct ctl_table **pentry, struct nsproxy *namespaces);
53static int insert_links(struct ctl_table_header *head);
54static void put_links(struct ctl_table_header *header);
55
56static void sysctl_print_dir(struct ctl_dir *dir)
57{
58 if (dir->header.parent)
59 sysctl_print_dir(dir->header.parent);
60 printk(KERN_CONT "%s/", dir->header.ctl_table[0].procname);
61}
62
63static int namecmp(const char *name1, int len1, const char *name2, int len2)
64{
65 int minlen;
66 int cmp;
67
68 minlen = len1;
69 if (minlen > len2)
70 minlen = len2;
71
72 cmp = memcmp(name1, name2, minlen);
73 if (cmp == 0)
74 cmp = len1 - len2;
75 return cmp;
76}
77
78/* Called under sysctl_lock */
79static struct ctl_table *find_entry(struct ctl_table_header **phead,
80 struct ctl_dir *dir, const char *name, int namelen)
81{
82 struct ctl_table_header *head;
83 struct ctl_table *entry;
84 struct rb_node *node = dir->root.rb_node;
85
86 while (node)
87 {
88 struct ctl_node *ctl_node;
89 const char *procname;
90 int cmp;
91
92 ctl_node = rb_entry(node, struct ctl_node, node);
93 head = ctl_node->header;
94 entry = &head->ctl_table[ctl_node - head->node];
95 procname = entry->procname;
96
97 cmp = namecmp(name, namelen, procname, strlen(procname));
98 if (cmp < 0)
99 node = node->rb_left;
100 else if (cmp > 0)
101 node = node->rb_right;
102 else {
103 *phead = head;
104 return entry;
105 }
106 }
107 return NULL;
108}
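
find_entry() keys the per-directory rbtree on namecmp(), which compares the common prefix with memcmp() and breaks ties on length, so names sort bytewise with shorter strings first. The same total order demonstrated with qsort() in user space (a sketch; the kernel walks the tree rather than sorting an array):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Same order as namecmp() above: "rpc" < "rpcs" < "rq". */
static int namecmp(const char *n1, int l1, const char *n2, int l2)
{
	int minlen = l1 < l2 ? l1 : l2;
	int cmp = memcmp(n1, n2, minlen);

	return cmp ? cmp : l1 - l2;
}

static int qcmp(const void *a, const void *b)
{
	const char *s1 = *(const char *const *)a;
	const char *s2 = *(const char *const *)b;

	return namecmp(s1, strlen(s1), s2, strlen(s2));
}

int main(void)
{
	/* A sorted array stands in for one directory's rbtree. */
	const char *names[] = { "rq", "rpcs", "rpc", "overflowuid" };
	size_t i, n = sizeof(names) / sizeof(names[0]);

	qsort(names, n, sizeof(names[0]), qcmp);
	for (i = 0; i < n; i++)
		printf("%s\n", names[i]);
	return 0;
}
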
109
110static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
111{
112 struct rb_node *node = &head->node[entry - head->ctl_table].node;
113 struct rb_node **p = &head->parent->root.rb_node;
114 struct rb_node *parent = NULL;
115 const char *name = entry->procname;
116 int namelen = strlen(name);
117
118 while (*p) {
119 struct ctl_table_header *parent_head;
120 struct ctl_table *parent_entry;
121 struct ctl_node *parent_node;
122 const char *parent_name;
123 int cmp;
124
125 parent = *p;
126 parent_node = rb_entry(parent, struct ctl_node, node);
127 parent_head = parent_node->header;
128 parent_entry = &parent_head->ctl_table[parent_node - parent_head->node];
129 parent_name = parent_entry->procname;
130
131 cmp = namecmp(name, namelen, parent_name, strlen(parent_name));
132 if (cmp < 0)
133 p = &(*p)->rb_left;
134 else if (cmp > 0)
135 p = &(*p)->rb_right;
136 else {
137 printk(KERN_ERR "sysctl duplicate entry: ");
138 sysctl_print_dir(head->parent);
139 printk(KERN_CONT "/%s\n", entry->procname);
140 return -EEXIST;
141 }
142 }
143
144 rb_link_node(node, parent, p);
145 return 0;
146}
147
148static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
149{
150 struct rb_node *node = &head->node[entry - head->ctl_table].node;
151
152 rb_erase(node, &head->parent->root);
153}
154
155static void init_header(struct ctl_table_header *head,
156 struct ctl_table_root *root, struct ctl_table_set *set,
157 struct ctl_node *node, struct ctl_table *table)
158{
159 head->ctl_table = table;
160 head->ctl_table_arg = table;
161 head->used = 0;
162 head->count = 1;
163 head->nreg = 1;
164 head->unregistering = NULL;
165 head->root = root;
166 head->set = set;
167 head->parent = NULL;
168 head->node = node;
169 if (node) {
170 struct ctl_table *entry;
171 for (entry = table; entry->procname; entry++, node++) {
172 rb_init_node(&node->node);
173 node->header = head;
174 }
175 }
176}
177
178static void erase_header(struct ctl_table_header *head)
179{
180 struct ctl_table *entry;
181 for (entry = head->ctl_table; entry->procname; entry++)
182 erase_entry(head, entry);
183}
184
185static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
186{
187 struct ctl_table *entry;
188 int err;
189
190 dir->header.nreg++;
191 header->parent = dir;
192 err = insert_links(header);
193 if (err)
194 goto fail_links;
195 for (entry = header->ctl_table; entry->procname; entry++) {
196 err = insert_entry(header, entry);
197 if (err)
198 goto fail;
199 }
200 return 0;
201fail:
202 erase_header(header);
203 put_links(header);
204fail_links:
205 header->parent = NULL;
206 drop_sysctl_table(&dir->header);
207 return err;
208}
209
210/* called under sysctl_lock */
211static int use_table(struct ctl_table_header *p)
212{
213 if (unlikely(p->unregistering))
214 return 0;
215 p->used++;
216 return 1;
217}
218
219/* called under sysctl_lock */
220static void unuse_table(struct ctl_table_header *p)
221{
222 if (!--p->used)
223 if (unlikely(p->unregistering))
224 complete(p->unregistering);
225}
226
227/* called under sysctl_lock, will reacquire if has to wait */
228static void start_unregistering(struct ctl_table_header *p)
229{
230 /*
231 * if p->used is 0, nobody will ever touch that entry again;
232 * we'll eliminate all paths to it before dropping sysctl_lock
233 */
234 if (unlikely(p->used)) {
235 struct completion wait;
236 init_completion(&wait);
237 p->unregistering = &wait;
238 spin_unlock(&sysctl_lock);
239 wait_for_completion(&wait);
240 spin_lock(&sysctl_lock);
241 } else {
242 /* anything non-NULL; we'll never dereference it */
243 p->unregistering = ERR_PTR(-EINVAL);
244 }
245 /*
246 * do not remove from the list until nobody holds it; walking the
247 * list in do_sysctl() relies on that.
248 */
249 erase_header(p);
250}
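
use_table(), unuse_table() and start_unregistering() together form a drain-before-teardown handshake: readers hold a use count, and the unregistering thread parks on a completion until the last reader drops it. The same protocol in portable C, with a mutex for sysctl_lock and a condition variable standing in for struct completion:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  drained = PTHREAD_COND_INITIALIZER;
static int used;
static int unregistering;

/* use_table(): refuse new readers once unregistration has begun. */
static int use(void)
{
	int ok;

	pthread_mutex_lock(&lock);
	ok = !unregistering;
	if (ok)
		used++;
	pthread_mutex_unlock(&lock);
	return ok;
}

/* unuse_table(): the last reader wakes the unregistering thread. */
static void unuse(void)
{
	pthread_mutex_lock(&lock);
	if (!--used)
		pthread_cond_signal(&drained);
	pthread_mutex_unlock(&lock);
}

static void start_unregistering(void)
{
	pthread_mutex_lock(&lock);
	unregistering = 1;
	while (used)
		pthread_cond_wait(&drained, &lock);
	pthread_mutex_unlock(&lock);
	/* no reader can hold the table now; safe to erase it */
}

int main(void)
{
	if (use()) {
		puts("reading table");
		unuse();
	}
	start_unregistering();
	puts("unregistered");
	return 0;
}
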
251
252static void sysctl_head_get(struct ctl_table_header *head)
253{
254 spin_lock(&sysctl_lock);
255 head->count++;
256 spin_unlock(&sysctl_lock);
257}
258
259void sysctl_head_put(struct ctl_table_header *head)
260{
261 spin_lock(&sysctl_lock);
262 if (!--head->count)
263 kfree_rcu(head, rcu);
264 spin_unlock(&sysctl_lock);
265}
266
267static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
268{
269 if (!head)
270 BUG();
271 spin_lock(&sysctl_lock);
272 if (!use_table(head))
273 head = ERR_PTR(-ENOENT);
274 spin_unlock(&sysctl_lock);
275 return head;
276}
277
278static void sysctl_head_finish(struct ctl_table_header *head)
279{
280 if (!head)
281 return;
282 spin_lock(&sysctl_lock);
283 unuse_table(head);
284 spin_unlock(&sysctl_lock);
285}
286
287static struct ctl_table_set *
288lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
289{
290 struct ctl_table_set *set = &root->default_set;
291 if (root->lookup)
292 set = root->lookup(root, namespaces);
293 return set;
294}
295
296static struct ctl_table *lookup_entry(struct ctl_table_header **phead,
297 struct ctl_dir *dir,
298 const char *name, int namelen)
299{
300 struct ctl_table_header *head;
301 struct ctl_table *entry;
302
303 spin_lock(&sysctl_lock);
304 entry = find_entry(&head, dir, name, namelen);
305 if (entry && use_table(head))
306 *phead = head;
307 else
308 entry = NULL;
309 spin_unlock(&sysctl_lock);
310 return entry;
311}
312
313static struct ctl_node *first_usable_entry(struct rb_node *node)
314{
315 struct ctl_node *ctl_node;
316
317 for (;node; node = rb_next(node)) {
318 ctl_node = rb_entry(node, struct ctl_node, node);
319 if (use_table(ctl_node->header))
320 return ctl_node;
321 }
322 return NULL;
323}
324
325static void first_entry(struct ctl_dir *dir,
326 struct ctl_table_header **phead, struct ctl_table **pentry)
327{
328 struct ctl_table_header *head = NULL;
329 struct ctl_table *entry = NULL;
330 struct ctl_node *ctl_node;
331
332 spin_lock(&sysctl_lock);
333 ctl_node = first_usable_entry(rb_first(&dir->root));
334 spin_unlock(&sysctl_lock);
335 if (ctl_node) {
336 head = ctl_node->header;
337 entry = &head->ctl_table[ctl_node - head->node];
338 }
339 *phead = head;
340 *pentry = entry;
341}
342
343static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry)
344{
345 struct ctl_table_header *head = *phead;
346 struct ctl_table *entry = *pentry;
347 struct ctl_node *ctl_node = &head->node[entry - head->ctl_table];
348
349 spin_lock(&sysctl_lock);
350 unuse_table(head);
351
352 ctl_node = first_usable_entry(rb_next(&ctl_node->node));
353 spin_unlock(&sysctl_lock);
354 head = NULL;
355 if (ctl_node) {
356 head = ctl_node->header;
357 entry = &head->ctl_table[ctl_node - head->node];
358 }
359 *phead = head;
360 *pentry = entry;
361}
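
first_entry() and next_entry() form an iterator that keeps a use count pinned on whichever header it currently yields and moves that pin forward one step at a time, so readdir can block between entries without the table disappearing underneath it. The shape of the hand-off, minus the locking, in a self-contained sketch (the refs field and helper names are invented):

#include <stdio.h>

struct item { int refs; int val; struct item *next; };

static struct item *iter_first(struct item *head)
{
	if (head)
		head->refs++;		/* pin before yielding */
	return head;
}

static struct item *iter_next(struct item *cur)
{
	struct item *next = cur->next;

	if (next)
		next->refs++;		/* pin the successor first... */
	cur->refs--;			/* ...then drop the old pin */
	return next;
}

int main(void)
{
	struct item c = { 0, 3, NULL };
	struct item b = { 0, 2, &c };
	struct item a = { 0, 1, &b };
	struct item *i;

	for (i = iter_first(&a); i; i = iter_next(i))
		printf("val=%d refs=%d\n", i->val, i->refs);
	return 0;
}
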
362
363void register_sysctl_root(struct ctl_table_root *root)
364{
365}
366
367/*
368 * sysctl_perm does NOT grant the superuser all rights automatically, because
369 * some sysctl variables are readonly even to root.
370 */
371
372static int test_perm(int mode, int op)
373{
374 if (!current_euid())
375 mode >>= 6;
376 else if (in_egroup_p(0))
377 mode >>= 3;
378 if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
379 return 0;
380 return -EACCES;
381}
382
383static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
384{
385 int mode;
386
387 if (root->permissions)
388 mode = root->permissions(root, current->nsproxy, table);
389 else
390 mode = table->mode;
391
392 return test_perm(mode, op);
393}
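
test_perm() is plain rwxrwxrwx arithmetic: a zero euid selects the owner triple by shifting the mode right six bits, group membership shifts by three, and every requested op bit must be present in the selected triple. That is also exactly how a 0444 table stays read-only even for root, as the comment above promises. Worked in user space with the identity passed in explicitly (the MAY_* values match the kernel's; -13 is -EACCES):

#include <stdio.h>

#define MAY_EXEC  0x1
#define MAY_WRITE 0x2
#define MAY_READ  0x4

static int test_perm(int mode, int op, int is_root, int in_group)
{
	if (is_root)
		mode >>= 6;		/* owner rwx bits */
	else if (in_group)
		mode >>= 3;		/* group rwx bits */
	if ((op & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
		return 0;
	return -13;			/* -EACCES */
}

int main(void)
{
	/* 0644: owner may write, group and others may only read. */
	printf("root write on 0644: %d\n", test_perm(0644, MAY_WRITE, 1, 0));
	printf("group write on 0644: %d\n", test_perm(0644, MAY_WRITE, 0, 1));
	/* 0444: read-only even for root. */
	printf("root write on 0444: %d\n", test_perm(0444, MAY_WRITE, 1, 0));
	return 0;
}
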
394
29static struct inode *proc_sys_make_inode(struct super_block *sb, 395static struct inode *proc_sys_make_inode(struct super_block *sb,
30 struct ctl_table_header *head, struct ctl_table *table) 396 struct ctl_table_header *head, struct ctl_table *table)
31{ 397{
@@ -45,13 +411,12 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
45 411
46 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 412 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
47 inode->i_mode = table->mode; 413 inode->i_mode = table->mode;
48 if (!table->child) { 414 if (!S_ISDIR(table->mode)) {
49 inode->i_mode |= S_IFREG; 415 inode->i_mode |= S_IFREG;
50 inode->i_op = &proc_sys_inode_operations; 416 inode->i_op = &proc_sys_inode_operations;
51 inode->i_fop = &proc_sys_file_operations; 417 inode->i_fop = &proc_sys_file_operations;
52 } else { 418 } else {
53 inode->i_mode |= S_IFDIR; 419 inode->i_mode |= S_IFDIR;
54 clear_nlink(inode);
55 inode->i_op = &proc_sys_dir_operations; 420 inode->i_op = &proc_sys_dir_operations;
56 inode->i_fop = &proc_sys_dir_file_operations; 421 inode->i_fop = &proc_sys_dir_file_operations;
57 } 422 }
@@ -59,70 +424,42 @@ out:
59 return inode; 424 return inode;
60} 425}
61 426
62static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
63{
64 int len;
65 for ( ; p->procname; p++) {
66
67 if (!p->procname)
68 continue;
69
70 len = strlen(p->procname);
71 if (len != name->len)
72 continue;
73
74 if (memcmp(p->procname, name->name, len) != 0)
75 continue;
76
77 /* I have a match */
78 return p;
79 }
80 return NULL;
81}
82
83static struct ctl_table_header *grab_header(struct inode *inode) 427static struct ctl_table_header *grab_header(struct inode *inode)
84{ 428{
85 if (PROC_I(inode)->sysctl) 429 struct ctl_table_header *head = PROC_I(inode)->sysctl;
86 return sysctl_head_grab(PROC_I(inode)->sysctl); 430 if (!head)
87 else 431 head = &sysctl_table_root.default_set.dir.header;
88 return sysctl_head_next(NULL); 432 return sysctl_head_grab(head);
89} 433}
90 434
91static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, 435static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
92 struct nameidata *nd) 436 struct nameidata *nd)
93{ 437{
94 struct ctl_table_header *head = grab_header(dir); 438 struct ctl_table_header *head = grab_header(dir);
95 struct ctl_table *table = PROC_I(dir)->sysctl_entry;
96 struct ctl_table_header *h = NULL; 439 struct ctl_table_header *h = NULL;
97 struct qstr *name = &dentry->d_name; 440 struct qstr *name = &dentry->d_name;
98 struct ctl_table *p; 441 struct ctl_table *p;
99 struct inode *inode; 442 struct inode *inode;
100 struct dentry *err = ERR_PTR(-ENOENT); 443 struct dentry *err = ERR_PTR(-ENOENT);
444 struct ctl_dir *ctl_dir;
445 int ret;
101 446
102 if (IS_ERR(head)) 447 if (IS_ERR(head))
103 return ERR_CAST(head); 448 return ERR_CAST(head);
104 449
105 if (table && !table->child) { 450 ctl_dir = container_of(head, struct ctl_dir, header);
106 WARN_ON(1);
107 goto out;
108 }
109
110 table = table ? table->child : head->ctl_table;
111
112 p = find_in_table(table, name);
113 if (!p) {
114 for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) {
115 if (h->attached_to != table)
116 continue;
117 p = find_in_table(h->attached_by, name);
118 if (p)
119 break;
120 }
121 }
122 451
452 p = lookup_entry(&h, ctl_dir, name->name, name->len);
123 if (!p) 453 if (!p)
124 goto out; 454 goto out;
125 455
456 if (S_ISLNK(p->mode)) {
457 ret = sysctl_follow_link(&h, &p, current->nsproxy);
458 err = ERR_PTR(ret);
459 if (ret)
460 goto out;
461 }
462
126 err = ERR_PTR(-ENOMEM); 463 err = ERR_PTR(-ENOMEM);
127 inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); 464 inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
128 if (h) 465 if (h)
@@ -190,20 +527,32 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
190 527
191static int proc_sys_open(struct inode *inode, struct file *filp) 528static int proc_sys_open(struct inode *inode, struct file *filp)
192{ 529{
530 struct ctl_table_header *head = grab_header(inode);
193 struct ctl_table *table = PROC_I(inode)->sysctl_entry; 531 struct ctl_table *table = PROC_I(inode)->sysctl_entry;
194 532
533 /* sysctl was unregistered */
534 if (IS_ERR(head))
535 return PTR_ERR(head);
536
195 if (table->poll) 537 if (table->poll)
196 filp->private_data = proc_sys_poll_event(table->poll); 538 filp->private_data = proc_sys_poll_event(table->poll);
197 539
540 sysctl_head_finish(head);
541
198 return 0; 542 return 0;
199} 543}
200 544
201static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) 545static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
202{ 546{
203 struct inode *inode = filp->f_path.dentry->d_inode; 547 struct inode *inode = filp->f_path.dentry->d_inode;
548 struct ctl_table_header *head = grab_header(inode);
204 struct ctl_table *table = PROC_I(inode)->sysctl_entry; 549 struct ctl_table *table = PROC_I(inode)->sysctl_entry;
205 unsigned long event = (unsigned long)filp->private_data;
206 unsigned int ret = DEFAULT_POLLMASK; 550 unsigned int ret = DEFAULT_POLLMASK;
551 unsigned long event;
552
553 /* sysctl was unregistered */
554 if (IS_ERR(head))
555 return POLLERR | POLLHUP;
207 556
208 if (!table->proc_handler) 557 if (!table->proc_handler)
209 goto out; 558 goto out;
@@ -211,6 +560,7 @@ static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
211 if (!table->poll) 560 if (!table->poll)
212 goto out; 561 goto out;
213 562
563 event = (unsigned long)filp->private_data;
214 poll_wait(filp, &table->poll->wait, wait); 564 poll_wait(filp, &table->poll->wait, wait);
215 565
216 if (event != atomic_read(&table->poll->event)) { 566 if (event != atomic_read(&table->poll->event)) {
@@ -219,6 +569,8 @@ static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
219 } 569 }
220 570
221out: 571out:
572 sysctl_head_finish(head);
573
222 return ret; 574 return ret;
223} 575}
224 576
@@ -260,28 +612,45 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
260 return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); 612 return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type);
261} 613}
262 614
615static int proc_sys_link_fill_cache(struct file *filp, void *dirent,
616 filldir_t filldir,
617 struct ctl_table_header *head,
618 struct ctl_table *table)
619{
620 int err, ret = 0;
621 head = sysctl_head_grab(head);
622
623 if (S_ISLNK(table->mode)) {
624 /* It is not an error if we cannot follow the link; just ignore it */
625 err = sysctl_follow_link(&head, &table, current->nsproxy);
626 if (err)
627 goto out;
628 }
629
630 ret = proc_sys_fill_cache(filp, dirent, filldir, head, table);
631out:
632 sysctl_head_finish(head);
633 return ret;
634}
635
263static int scan(struct ctl_table_header *head, ctl_table *table, 636static int scan(struct ctl_table_header *head, ctl_table *table,
264 unsigned long *pos, struct file *file, 637 unsigned long *pos, struct file *file,
265 void *dirent, filldir_t filldir) 638 void *dirent, filldir_t filldir)
266{ 639{
640 int res;
267 641
268 for (; table->procname; table++, (*pos)++) { 642 if ((*pos)++ < file->f_pos)
269 int res; 643 return 0;
270
271 /* Can't do anything without a proc name */
272 if (!table->procname)
273 continue;
274
275 if (*pos < file->f_pos)
276 continue;
277 644
645 if (unlikely(S_ISLNK(table->mode)))
646 res = proc_sys_link_fill_cache(file, dirent, filldir, head, table);
647 else
278 res = proc_sys_fill_cache(file, dirent, filldir, head, table); 648 res = proc_sys_fill_cache(file, dirent, filldir, head, table);
279 if (res)
280 return res;
281 649
282 file->f_pos = *pos + 1; 650 if (res == 0)
283 } 651 file->f_pos = *pos;
284 return 0; 652
653 return res;
285} 654}
286 655
287static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) 656static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
@@ -289,20 +658,16 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
289 struct dentry *dentry = filp->f_path.dentry; 658 struct dentry *dentry = filp->f_path.dentry;
290 struct inode *inode = dentry->d_inode; 659 struct inode *inode = dentry->d_inode;
291 struct ctl_table_header *head = grab_header(inode); 660 struct ctl_table_header *head = grab_header(inode);
292 struct ctl_table *table = PROC_I(inode)->sysctl_entry;
293 struct ctl_table_header *h = NULL; 661 struct ctl_table_header *h = NULL;
662 struct ctl_table *entry;
663 struct ctl_dir *ctl_dir;
294 unsigned long pos; 664 unsigned long pos;
295 int ret = -EINVAL; 665 int ret = -EINVAL;
296 666
297 if (IS_ERR(head)) 667 if (IS_ERR(head))
298 return PTR_ERR(head); 668 return PTR_ERR(head);
299 669
300 if (table && !table->child) { 670 ctl_dir = container_of(head, struct ctl_dir, header);
301 WARN_ON(1);
302 goto out;
303 }
304
305 table = table ? table->child : head->ctl_table;
306 671
307 ret = 0; 672 ret = 0;
308 /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ 673 /* Avoid a switch here: arm builds fail with missing __cmpdi2 */
@@ -320,14 +685,8 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
320 } 685 }
321 pos = 2; 686 pos = 2;
322 687
323 ret = scan(head, table, &pos, filp, dirent, filldir); 688 for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) {
324 if (ret) 689 ret = scan(h, entry, &pos, filp, dirent, filldir);
325 goto out;
326
327 for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) {
328 if (h->attached_to != table)
329 continue;
330 ret = scan(h, h->attached_by, &pos, filp, dirent, filldir);
331 if (ret) { 690 if (ret) {
332 sysctl_head_finish(h); 691 sysctl_head_finish(h);
333 break; 692 break;
@@ -447,6 +806,21 @@ static int proc_sys_delete(const struct dentry *dentry)
447 return !!PROC_I(dentry->d_inode)->sysctl->unregistering; 806 return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
448} 807}
449 808
809static int sysctl_is_seen(struct ctl_table_header *p)
810{
811 struct ctl_table_set *set = p->set;
812 int res;
813 spin_lock(&sysctl_lock);
814 if (p->unregistering)
815 res = 0;
816 else if (!set->is_seen)
817 res = 1;
818 else
819 res = set->is_seen(set);
820 spin_unlock(&sysctl_lock);
821 return res;
822}
823
450static int proc_sys_compare(const struct dentry *parent, 824static int proc_sys_compare(const struct dentry *parent,
451 const struct inode *pinode, 825 const struct inode *pinode,
452 const struct dentry *dentry, const struct inode *inode, 826 const struct dentry *dentry, const struct inode *inode,
@@ -472,6 +846,753 @@ static const struct dentry_operations proc_sys_dentry_operations = {
472 .d_compare = proc_sys_compare, 846 .d_compare = proc_sys_compare,
473}; 847};
474 848
849static struct ctl_dir *find_subdir(struct ctl_dir *dir,
850 const char *name, int namelen)
851{
852 struct ctl_table_header *head;
853 struct ctl_table *entry;
854
855 entry = find_entry(&head, dir, name, namelen);
856 if (!entry)
857 return ERR_PTR(-ENOENT);
858 if (!S_ISDIR(entry->mode))
859 return ERR_PTR(-ENOTDIR);
860 return container_of(head, struct ctl_dir, header);
861}
862
863static struct ctl_dir *new_dir(struct ctl_table_set *set,
864 const char *name, int namelen)
865{
866 struct ctl_table *table;
867 struct ctl_dir *new;
868 struct ctl_node *node;
869 char *new_name;
870
871 new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) +
872 sizeof(struct ctl_table)*2 + namelen + 1,
873 GFP_KERNEL);
874 if (!new)
875 return NULL;
876
877 node = (struct ctl_node *)(new + 1);
878 table = (struct ctl_table *)(node + 1);
879 new_name = (char *)(table + 2);
880 memcpy(new_name, name, namelen);
881 new_name[namelen] = '\0';
882 table[0].procname = new_name;
883 table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
884 init_header(&new->header, set->dir.header.root, set, node, table);
885
886 return new;
887}
888
889/**
890 * get_subdir - find or create a subdir with the specified name.
891 * @dir: Directory to create the subdirectory in
892 * @name: The name of the subdirectory to find or create
893 * @namelen: The length of name
894 *
895 * Takes a directory with an elevated reference count so we know that
896 * if we drop the lock the directory will not go away. Upon success
897 * the reference is moved from @dir to the returned subdirectory.
898 * Upon error an error code is returned and the reference on @dir is
899 * simply dropped.
900 */
901static struct ctl_dir *get_subdir(struct ctl_dir *dir,
902 const char *name, int namelen)
903{
904 struct ctl_table_set *set = dir->header.set;
905 struct ctl_dir *subdir, *new = NULL;
906 int err;
907
908 spin_lock(&sysctl_lock);
909 subdir = find_subdir(dir, name, namelen);
910 if (!IS_ERR(subdir))
911 goto found;
912 if (PTR_ERR(subdir) != -ENOENT)
913 goto failed;
914
915 spin_unlock(&sysctl_lock);
916 new = new_dir(set, name, namelen);
917 spin_lock(&sysctl_lock);
918 subdir = ERR_PTR(-ENOMEM);
919 if (!new)
920 goto failed;
921
922 /* Was the subdir added while we dropped the lock? */
923 subdir = find_subdir(dir, name, namelen);
924 if (!IS_ERR(subdir))
925 goto found;
926 if (PTR_ERR(subdir) != -ENOENT)
927 goto failed;
928
929 /* Nope. Use our freshly made directory entry. */
930 err = insert_header(dir, &new->header);
931 subdir = ERR_PTR(err);
932 if (err)
933 goto failed;
934 subdir = new;
935found:
936 subdir->header.nreg++;
937failed:
938 if (unlikely(IS_ERR(subdir))) {
939 printk(KERN_ERR "sysctl could not get directory: ");
940 sysctl_print_dir(dir);
941 printk(KERN_CONT "/%*.*s %ld\n",
942 namelen, namelen, name, PTR_ERR(subdir));
943 }
944 drop_sysctl_table(&dir->header);
945 if (new)
946 drop_sysctl_table(&new->header);
947 spin_unlock(&sysctl_lock);
948 return subdir;
949}
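
get_subdir() is the standard optimistic pattern for allocating under a spinlock: drop the lock, allocate the new directory, retake the lock, and redo the lookup in case another CPU inserted the same name meanwhile, discarding the fresh copy on a lost race. A compilable miniature with a mutex and a single slot standing in for the rbtree:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static char *slot;			/* "the directory" */

static char *get_subdir(const char *name)
{
	char *found, *new = NULL;

	pthread_mutex_lock(&lock);
	found = (slot && !strcmp(slot, name)) ? slot : NULL;
	if (!found) {
		/* Not there: allocate with the lock dropped... */
		pthread_mutex_unlock(&lock);
		new = strdup(name);
		pthread_mutex_lock(&lock);
		/* ...then re-check: was it inserted while we slept? */
		found = (slot && !strcmp(slot, name)) ? slot : NULL;
		if (!found && new) {
			slot = new;	/* we won the race: insert ours */
			found = new;
			new = NULL;
		}
	}
	pthread_mutex_unlock(&lock);
	free(new);			/* raced and lost: discard our copy */
	return found;
}

int main(void)
{
	printf("%s\n", get_subdir("net"));
	printf("%s\n", get_subdir("net"));	/* second call just finds it */
	return 0;
}
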
950
951static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
952{
953 struct ctl_dir *parent;
954 const char *procname;
955 if (!dir->header.parent)
956 return &set->dir;
957 parent = xlate_dir(set, dir->header.parent);
958 if (IS_ERR(parent))
959 return parent;
960 procname = dir->header.ctl_table[0].procname;
961 return find_subdir(parent, procname, strlen(procname));
962}
963
964static int sysctl_follow_link(struct ctl_table_header **phead,
965 struct ctl_table **pentry, struct nsproxy *namespaces)
966{
967 struct ctl_table_header *head;
968 struct ctl_table_root *root;
969 struct ctl_table_set *set;
970 struct ctl_table *entry;
971 struct ctl_dir *dir;
972 int ret;
973
974 ret = 0;
975 spin_lock(&sysctl_lock);
976 root = (*pentry)->data;
977 set = lookup_header_set(root, namespaces);
978 dir = xlate_dir(set, (*phead)->parent);
979 if (IS_ERR(dir))
980 ret = PTR_ERR(dir);
981 else {
982 const char *procname = (*pentry)->procname;
983 head = NULL;
984 entry = find_entry(&head, dir, procname, strlen(procname));
985 ret = -ENOENT;
986 if (entry && use_table(head)) {
987 unuse_table(*phead);
988 *phead = head;
989 *pentry = entry;
990 ret = 0;
991 }
992 }
993
994 spin_unlock(&sysctl_lock);
995 return ret;
996}
997
998static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
999{
1000 struct va_format vaf;
1001 va_list args;
1002
1003 va_start(args, fmt);
1004 vaf.fmt = fmt;
1005 vaf.va = &args;
1006
1007 printk(KERN_ERR "sysctl table check failed: %s/%s %pV\n",
1008 path, table->procname, &vaf);
1009
1010 va_end(args);
1011 return -EINVAL;
1012}
1013
1014static int sysctl_check_table(const char *path, struct ctl_table *table)
1015{
1016 int err = 0;
1017 for (; table->procname; table++) {
1018 if (table->child)
1019 err = sysctl_err(path, table, "Not a file");
1020
1021 if ((table->proc_handler == proc_dostring) ||
1022 (table->proc_handler == proc_dointvec) ||
1023 (table->proc_handler == proc_dointvec_minmax) ||
1024 (table->proc_handler == proc_dointvec_jiffies) ||
1025 (table->proc_handler == proc_dointvec_userhz_jiffies) ||
1026 (table->proc_handler == proc_dointvec_ms_jiffies) ||
1027 (table->proc_handler == proc_doulongvec_minmax) ||
1028 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
1029 if (!table->data)
1030 err = sysctl_err(path, table, "No data");
1031 if (!table->maxlen)
1032 err = sysctl_err(path, table, "No maxlen");
1033 }
1034 if (!table->proc_handler)
1035 err = sysctl_err(path, table, "No proc_handler");
1036
1037 if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode)
1038 err = sysctl_err(path, table, "bogus .mode 0%o",
1039 table->mode);
1040 }
1041 return err;
1042}
1043
1044static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table,
1045 struct ctl_table_root *link_root)
1046{
1047 struct ctl_table *link_table, *entry, *link;
1048 struct ctl_table_header *links;
1049 struct ctl_node *node;
1050 char *link_name;
1051 int nr_entries, name_bytes;
1052
1053 name_bytes = 0;
1054 nr_entries = 0;
1055 for (entry = table; entry->procname; entry++) {
1056 nr_entries++;
1057 name_bytes += strlen(entry->procname) + 1;
1058 }
1059
1060 links = kzalloc(sizeof(struct ctl_table_header) +
1061 sizeof(struct ctl_node)*nr_entries +
1062 sizeof(struct ctl_table)*(nr_entries + 1) +
1063 name_bytes,
1064 GFP_KERNEL);
1065
1066 if (!links)
1067 return NULL;
1068
1069 node = (struct ctl_node *)(links + 1);
1070 link_table = (struct ctl_table *)(node + nr_entries);
1071 link_name = (char *)&link_table[nr_entries + 1];
1072
1073 for (link = link_table, entry = table; entry->procname; link++, entry++) {
1074 int len = strlen(entry->procname) + 1;
1075 memcpy(link_name, entry->procname, len);
1076 link->procname = link_name;
1077 link->mode = S_IFLNK|S_IRWXUGO;
1078 link->data = link_root;
1079 link_name += len;
1080 }
1081 init_header(links, dir->header.root, dir->header.set, node, link_table);
1082 links->nreg = nr_entries;
1083
1084 return links;
1085}
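
new_dir() and new_links() both use the single-allocation layout trick: one kzalloc sized for the header, the ctl_node array, the ctl_table array plus its zero sentinel, and all the procname strings, carved up with pointer arithmetic so that a single kfree releases everything. The same carving in stand-alone C (struct names are simplified stand-ins for the ctl_* types):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node   { struct node *rb; };	/* stands in for ctl_node */
struct entry  { const char *procname; };
struct header { struct node *node; struct entry *table; };

static struct header *new_links(const char *const *names, int n)
{
	struct header *h;
	struct node *node;
	struct entry *table;
	char *name;
	size_t bytes = 0;
	int i;

	for (i = 0; i < n; i++)
		bytes += strlen(names[i]) + 1;

	/* One block: header, n nodes, n+1 entries, all the strings. */
	h = calloc(1, sizeof(*h) + sizeof(*node) * n +
		      sizeof(*table) * (n + 1) + bytes);
	if (!h)
		return NULL;

	node  = (struct node *)(h + 1);
	table = (struct entry *)(node + n);
	name  = (char *)(table + n + 1);	/* entry n is the sentinel */

	for (i = 0; i < n; i++) {
		size_t len = strlen(names[i]) + 1;

		memcpy(name, names[i], len);
		table[i].procname = name;
		name += len;
	}
	h->node = node;
	h->table = table;
	return h;
}

int main(void)
{
	const char *const names[] = { "aio-nr", "aio-max-nr" };
	struct header *h = new_links(names, 2);

	if (h) {
		printf("%s %s\n", h->table[0].procname, h->table[1].procname);
		free(h);	/* one free tears the whole thing down */
	}
	return 0;
}
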
1086
1087static bool get_links(struct ctl_dir *dir,
1088 struct ctl_table *table, struct ctl_table_root *link_root)
1089{
1090 struct ctl_table_header *head;
1091 struct ctl_table *entry, *link;
1092
1093 /* Are there links available for every entry in table? */
1094 for (entry = table; entry->procname; entry++) {
1095 const char *procname = entry->procname;
1096 link = find_entry(&head, dir, procname, strlen(procname));
1097 if (!link)
1098 return false;
1099 if (S_ISDIR(link->mode) && S_ISDIR(entry->mode))
1100 continue;
1101 if (S_ISLNK(link->mode) && (link->data == link_root))
1102 continue;
1103 return false;
1104 }
1105
1106 /* The checks passed. Increase the registration count on the links */
1107 for (entry = table; entry->procname; entry++) {
1108 const char *procname = entry->procname;
1109 link = find_entry(&head, dir, procname, strlen(procname));
1110 head->nreg++;
1111 }
1112 return true;
1113}
1114
1115static int insert_links(struct ctl_table_header *head)
1116{
1117 struct ctl_table_set *root_set = &sysctl_table_root.default_set;
1118 struct ctl_dir *core_parent = NULL;
1119 struct ctl_table_header *links;
1120 int err;
1121
1122 if (head->set == root_set)
1123 return 0;
1124
1125 core_parent = xlate_dir(root_set, head->parent);
1126 if (IS_ERR(core_parent))
1127 return 0;
1128
1129 if (get_links(core_parent, head->ctl_table, head->root))
1130 return 0;
1131
1132 core_parent->header.nreg++;
1133 spin_unlock(&sysctl_lock);
1134
1135 links = new_links(core_parent, head->ctl_table, head->root);
1136
1137 spin_lock(&sysctl_lock);
1138 err = -ENOMEM;
1139 if (!links)
1140 goto out;
1141
1142 err = 0;
1143 if (get_links(core_parent, head->ctl_table, head->root)) {
1144 kfree(links);
1145 goto out;
1146 }
1147
1148 err = insert_header(core_parent, links);
1149 if (err)
1150 kfree(links);
1151out:
1152 drop_sysctl_table(&core_parent->header);
1153 return err;
1154}
1155
1156/**
1157 * __register_sysctl_table - register a leaf sysctl table
1158 * @set: Sysctl tree to register on
1159 * @path: The path to the directory the sysctl table is in.
1160 * @table: the top-level table structure
1161 *
1162 * Register a leaf sysctl table. @table should be a filled in ctl_table
1163 * array. A completely 0 filled entry terminates the table.
1164 *
1165 * The members of the &struct ctl_table structure are used as follows:
1166 *
1167 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
1168 * enter a sysctl file
1169 *
1170 * data - a pointer to data for use by proc_handler
1171 *
1172 * maxlen - the maximum size in bytes of the data
1173 *
1174 * mode - the file permissions for the /proc/sys file
1175 *
1176 * child - must be %NULL.
1177 *
1178 * proc_handler - the text handler routine (described below)
1179 *
1180 * extra1, extra2 - extra pointers usable by the proc handler routines
1181 *
1182 * Leaf nodes in the sysctl tree will be represented by a single file
1183 * under /proc; non-leaf nodes will be represented by directories.
1184 *
1185 * There must be a proc_handler routine for any terminal nodes.
1186 * Several default handlers are available to cover common cases -
1187 *
1188 * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
1189 * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
1190 * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
1191 *
1192 * It is the handler's job to read the input buffer from user memory
1193 * and process it. The handler should return 0 on success.
1194 *
1195 * This routine returns %NULL on a failure to register, and a pointer
1196 * to the table header on success.
1197 */
1198struct ctl_table_header *__register_sysctl_table(
1199 struct ctl_table_set *set,
1200 const char *path, struct ctl_table *table)
1201{
1202 struct ctl_table_root *root = set->dir.header.root;
1203 struct ctl_table_header *header;
1204 const char *name, *nextname;
1205 struct ctl_dir *dir;
1206 struct ctl_table *entry;
1207 struct ctl_node *node;
1208 int nr_entries = 0;
1209
1210 for (entry = table; entry->procname; entry++)
1211 nr_entries++;
1212
1213 header = kzalloc(sizeof(struct ctl_table_header) +
1214 sizeof(struct ctl_node)*nr_entries, GFP_KERNEL);
1215 if (!header)
1216 return NULL;
1217
1218 node = (struct ctl_node *)(header + 1);
1219 init_header(header, root, set, node, table);
1220 if (sysctl_check_table(path, table))
1221 goto fail;
1222
1223 spin_lock(&sysctl_lock);
1224 dir = &set->dir;
1225 /* Reference moved down the directory tree by get_subdir */
1226 dir->header.nreg++;
1227 spin_unlock(&sysctl_lock);
1228
1229 /* Find the directory for the ctl_table */
1230 for (name = path; name; name = nextname) {
1231 int namelen;
1232 nextname = strchr(name, '/');
1233 if (nextname) {
1234 namelen = nextname - name;
1235 nextname++;
1236 } else {
1237 namelen = strlen(name);
1238 }
1239 if (namelen == 0)
1240 continue;
1241
1242 dir = get_subdir(dir, name, namelen);
1243 if (IS_ERR(dir))
1244 goto fail;
1245 }
1246
1247 spin_lock(&sysctl_lock);
1248 if (insert_header(dir, header))
1249 goto fail_put_dir_locked;
1250
1251 drop_sysctl_table(&dir->header);
1252 spin_unlock(&sysctl_lock);
1253
1254 return header;
1255
1256fail_put_dir_locked:
1257 drop_sysctl_table(&dir->header);
1258 spin_unlock(&sysctl_lock);
1259fail:
1260 kfree(header);
1261 dump_stack();
1262 return NULL;
1263}
1264
1265/**
1266 * register_sysctl - register a sysctl table
1267 * @path: The path to the directory the sysctl table is in.
1268 * @table: the table structure
1269 *
1270 * Register a sysctl table. @table should be a filled in ctl_table
1271 * array. A completely 0 filled entry terminates the table.
1272 *
1273 * See __register_sysctl_table for more details.
1274 */
1275struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table)
1276{
1277 return __register_sysctl_table(&sysctl_table_root.default_set,
1278 path, table);
1279}
1280EXPORT_SYMBOL(register_sysctl);
1281
1282static char *append_path(const char *path, char *pos, const char *name)
1283{
1284 int namelen;
1285 namelen = strlen(name);
1286 if (((pos - path) + namelen + 2) >= PATH_MAX)
1287 return NULL;
1288 memcpy(pos, name, namelen);
1289 pos[namelen] = '/';
1290 pos[namelen + 1] = '\0';
1291 pos += namelen + 1;
1292 return pos;
1293}
1294
1295static int count_subheaders(struct ctl_table *table)
1296{
1297 int has_files = 0;
1298 int nr_subheaders = 0;
1299 struct ctl_table *entry;
1300
1301 /* special case: no directory and empty directory */
1302 if (!table || !table->procname)
1303 return 1;
1304
1305 for (entry = table; entry->procname; entry++) {
1306 if (entry->child)
1307 nr_subheaders += count_subheaders(entry->child);
1308 else
1309 has_files = 1;
1310 }
1311 return nr_subheaders + has_files;
1312}
1313
1314static int register_leaf_sysctl_tables(const char *path, char *pos,
1315 struct ctl_table_header ***subheader, struct ctl_table_set *set,
1316 struct ctl_table *table)
1317{
1318 struct ctl_table *ctl_table_arg = NULL;
1319 struct ctl_table *entry, *files;
1320 int nr_files = 0;
1321 int nr_dirs = 0;
1322 int err = -ENOMEM;
1323
1324 for (entry = table; entry->procname; entry++) {
1325 if (entry->child)
1326 nr_dirs++;
1327 else
1328 nr_files++;
1329 }
1330
1331 files = table;
1332 /* If there are mixed files and directories we need a new table */
1333 if (nr_dirs && nr_files) {
1334 struct ctl_table *new;
1335 files = kzalloc(sizeof(struct ctl_table) * (nr_files + 1),
1336 GFP_KERNEL);
1337 if (!files)
1338 goto out;
1339
1340 ctl_table_arg = files;
1341 for (new = files, entry = table; entry->procname; entry++) {
1342 if (entry->child)
1343 continue;
1344 *new = *entry;
1345 new++;
1346 }
1347 }
1348
1349 /* Register everything except a directory full of subdirectories */
1350 if (nr_files || !nr_dirs) {
1351 struct ctl_table_header *header;
1352 header = __register_sysctl_table(set, path, files);
1353 if (!header) {
1354 kfree(ctl_table_arg);
1355 goto out;
1356 }
1357
1358 /* Remember if we need to free the file table */
1359 header->ctl_table_arg = ctl_table_arg;
1360 **subheader = header;
1361 (*subheader)++;
1362 }
1363
1364 /* Recurse into the subdirectories. */
1365 for (entry = table; entry->procname; entry++) {
1366 char *child_pos;
1367
1368 if (!entry->child)
1369 continue;
1370
1371 err = -ENAMETOOLONG;
1372 child_pos = append_path(path, pos, entry->procname);
1373 if (!child_pos)
1374 goto out;
1375
1376 err = register_leaf_sysctl_tables(path, child_pos, subheader,
1377 set, entry->child);
1378 pos[0] = '\0';
1379 if (err)
1380 goto out;
1381 }
1382 err = 0;
1383out:
1384 /* On failure our caller will unregister all registered subheaders */
1385 return err;
1386}
1387
1388/**
1389 * __register_sysctl_paths - register a sysctl table hierarchy
1390 * @set: Sysctl tree to register on
1391 * @path: The path to the directory the sysctl table is in.
1392 * @table: the top-level table structure
1393 *
1394 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1395 * array. A completely 0 filled entry terminates the table.
1396 *
1397 * See __register_sysctl_table for more details.
1398 */
1399struct ctl_table_header *__register_sysctl_paths(
1400 struct ctl_table_set *set,
1401 const struct ctl_path *path, struct ctl_table *table)
1402{
1403 struct ctl_table *ctl_table_arg = table;
1404 int nr_subheaders = count_subheaders(table);
1405 struct ctl_table_header *header = NULL, **subheaders, **subheader;
1406 const struct ctl_path *component;
1407 char *new_path, *pos;
1408
1409 pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL);
1410 if (!new_path)
1411 return NULL;
1412
1413 pos[0] = '\0';
1414 for (component = path; component->procname; component++) {
1415 pos = append_path(new_path, pos, component->procname);
1416 if (!pos)
1417 goto out;
1418 }
1419 while (table->procname && table->child && !table[1].procname) {
1420 pos = append_path(new_path, pos, table->procname);
1421 if (!pos)
1422 goto out;
1423 table = table->child;
1424 }
1425 if (nr_subheaders == 1) {
1426 header = __register_sysctl_table(set, new_path, table);
1427 if (header)
1428 header->ctl_table_arg = ctl_table_arg;
1429 } else {
1430 header = kzalloc(sizeof(*header) +
1431 sizeof(*subheaders)*nr_subheaders, GFP_KERNEL);
1432 if (!header)
1433 goto out;
1434
1435 subheaders = (struct ctl_table_header **) (header + 1);
1436 subheader = subheaders;
1437 header->ctl_table_arg = ctl_table_arg;
1438
1439 if (register_leaf_sysctl_tables(new_path, pos, &subheader,
1440 set, table))
1441 goto err_register_leaves;
1442 }
1443
1444out:
1445 kfree(new_path);
1446 return header;
1447
1448err_register_leaves:
1449 while (subheader > subheaders) {
1450 struct ctl_table_header *subh = *(--subheader);
1451 struct ctl_table *table = subh->ctl_table_arg;
1452 unregister_sysctl_table(subh);
1453 kfree(table);
1454 }
1455 kfree(header);
1456 header = NULL;
1457 goto out;
1458}
1459
1460/**
1461 * register_sysctl_paths - register a sysctl table hierarchy
1462 * @path: The path to the directory the sysctl table is in.
1463 * @table: the top-level table structure
1464 *
1465 * Register a sysctl table hierarchy. @table should be a filled-in ctl_table
1466 * array. A completely zero-filled entry terminates the table.
1467 *
1468 * See __register_sysctl_paths for more details.
1469 */
1470struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1471 struct ctl_table *table)
1472{
1473 return __register_sysctl_paths(&sysctl_table_root.default_set,
1474 path, table);
1475}
1476EXPORT_SYMBOL(register_sysctl_paths);
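
As a usage sketch of this interface (every demo_* name below is invented), a caller wanting its knob to show up as /proc/sys/net/demo/value would build a terminated ctl_path array alongside the table:

static int demo_value;				/* invented backing variable */

static const struct ctl_path demo_path[] = {
	{ .procname = "net" },
	{ .procname = "demo" },
	{ }					/* terminator */
};

static struct ctl_table demo_ctl[] = {
	{
		.procname	= "value",
		.data		= &demo_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table_header *demo_header;

static int __init demo_sysctl_init(void)
{
	demo_header = register_sysctl_paths(demo_path, demo_ctl);
	return demo_header ? 0 : -ENOMEM;
}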
1477
1478/**
1479 * register_sysctl_table - register a sysctl table hierarchy
1480 * @table: the top-level table structure
1481 *
1482 * Register a sysctl table hierarchy. @table should be a filled-in ctl_table
1483 * array. A completely zero-filled entry terminates the table.
1484 *
1485 * See register_sysctl_paths for more details.
1486 */
1487struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
1488{
1489 static const struct ctl_path null_path[] = { {} };
1490
1491 return register_sysctl_paths(null_path, table);
1492}
1493EXPORT_SYMBOL(register_sysctl_table);
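
When no path components are needed, the table registers at the sysctl root and teardown is symmetric in both cases; a sketch reusing the hypothetical demo_ctl from above:

static int __init demo_init(void)
{
	demo_header = register_sysctl_table(demo_ctl);	/* /proc/sys/value */
	return demo_header ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
	/* NULL-safe; may sleep, so never call under a spinlock */
	unregister_sysctl_table(demo_header);
}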
1494
1495static void put_links(struct ctl_table_header *header)
1496{
1497 struct ctl_table_set *root_set = &sysctl_table_root.default_set;
1498 struct ctl_table_root *root = header->root;
1499 struct ctl_dir *parent = header->parent;
1500 struct ctl_dir *core_parent;
1501 struct ctl_table *entry;
1502
1503 if (header->set == root_set)
1504 return;
1505
1506 core_parent = xlate_dir(root_set, parent);
1507 if (IS_ERR(core_parent))
1508 return;
1509
1510 for (entry = header->ctl_table; entry->procname; entry++) {
1511 struct ctl_table_header *link_head;
1512 struct ctl_table *link;
1513 const char *name = entry->procname;
1514
1515 link = find_entry(&link_head, core_parent, name, strlen(name));
1516 if (link &&
1517 ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) ||
1518 (S_ISLNK(link->mode) && (link->data == root)))) {
1519 drop_sysctl_table(link_head);
1520 }
1521 else {
1522 printk(KERN_ERR "sysctl link missing during unregister: ");
1523 sysctl_print_dir(parent);
1524 printk(KERN_CONT "/%s\n", name);
1525 }
1526 }
1527}
1528
1529static void drop_sysctl_table(struct ctl_table_header *header)
1530{
1531 struct ctl_dir *parent = header->parent;
1532
1533 if (--header->nreg)
1534 return;
1535
1536 put_links(header);
1537 start_unregistering(header);
1538 if (!--header->count)
1539 kfree_rcu(header, rcu);
1540
1541 if (parent)
1542 drop_sysctl_table(&parent->header);
1543}
1544
1545/**
1546 * unregister_sysctl_table - unregister a sysctl table hierarchy
1547 * @header: the header returned from register_sysctl_table
1548 *
 1549 * Unregisters the sysctl table and all children. The /proc entries may not
 1550 * actually be removed until they are no longer used by anyone.
1551 */
1552void unregister_sysctl_table(struct ctl_table_header * header)
1553{
1554 int nr_subheaders;
1555 might_sleep();
1556
1557 if (header == NULL)
1558 return;
1559
1560 nr_subheaders = count_subheaders(header->ctl_table_arg);
1561 if (unlikely(nr_subheaders > 1)) {
1562 struct ctl_table_header **subheaders;
1563 int i;
1564
1565 subheaders = (struct ctl_table_header **)(header + 1);
 1566 for (i = nr_subheaders - 1; i >= 0; i--) {
1567 struct ctl_table_header *subh = subheaders[i];
1568 struct ctl_table *table = subh->ctl_table_arg;
1569 unregister_sysctl_table(subh);
1570 kfree(table);
1571 }
1572 kfree(header);
1573 return;
1574 }
1575
1576 spin_lock(&sysctl_lock);
1577 drop_sysctl_table(header);
1578 spin_unlock(&sysctl_lock);
1579}
1580EXPORT_SYMBOL(unregister_sysctl_table);
1581
1582void setup_sysctl_set(struct ctl_table_set *set,
1583 struct ctl_table_root *root,
1584 int (*is_seen)(struct ctl_table_set *))
1585{
1586 memset(set, 0, sizeof(*set));
1587 set->is_seen = is_seen;
1588 init_header(&set->dir.header, root, set, NULL, root_table);
1589}
1590
1591void retire_sysctl_set(struct ctl_table_set *set)
1592{
1593 WARN_ON(!RB_EMPTY_ROOT(&set->dir.root));
1594}
1595
475int __init proc_sys_init(void) 1596int __init proc_sys_init(void)
476{ 1597{
477 struct proc_dir_entry *proc_sys_root; 1598 struct proc_dir_entry *proc_sys_root;
@@ -480,5 +1601,6 @@ int __init proc_sys_init(void)
480 proc_sys_root->proc_iops = &proc_sys_dir_operations; 1601 proc_sys_root->proc_iops = &proc_sys_dir_operations;
481 proc_sys_root->proc_fops = &proc_sys_dir_file_operations; 1602 proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
482 proc_sys_root->nlink = 0; 1603 proc_sys_root->nlink = 0;
483 return 0; 1604
1605 return sysctl_init();
484} 1606}
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 121f77cfef76..6a0c62d6e442 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -89,18 +89,19 @@ static int show_stat(struct seq_file *p, void *v)
89 } 89 }
90 sum += arch_irq_stat(); 90 sum += arch_irq_stat();
91 91
92 seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu " 92 seq_puts(p, "cpu ");
93 "%llu\n", 93 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
94 (unsigned long long)cputime64_to_clock_t(user), 94 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
95 (unsigned long long)cputime64_to_clock_t(nice), 95 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
96 (unsigned long long)cputime64_to_clock_t(system), 96 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
97 (unsigned long long)cputime64_to_clock_t(idle), 97 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
98 (unsigned long long)cputime64_to_clock_t(iowait), 98 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
99 (unsigned long long)cputime64_to_clock_t(irq), 99 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
100 (unsigned long long)cputime64_to_clock_t(softirq), 100 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
101 (unsigned long long)cputime64_to_clock_t(steal), 101 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
102 (unsigned long long)cputime64_to_clock_t(guest), 102 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
103 (unsigned long long)cputime64_to_clock_t(guest_nice)); 103 seq_putc(p, '\n');
104
104 for_each_online_cpu(i) { 105 for_each_online_cpu(i) {
105 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ 106 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
106 user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; 107 user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
@@ -113,26 +114,24 @@ static int show_stat(struct seq_file *p, void *v)
113 steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; 114 steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
114 guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; 115 guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
115 guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; 116 guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
116 seq_printf(p, 117 seq_printf(p, "cpu%d", i);
117 "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " 118 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
118 "%llu\n", 119 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
119 i, 120 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
120 (unsigned long long)cputime64_to_clock_t(user), 121 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
121 (unsigned long long)cputime64_to_clock_t(nice), 122 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
122 (unsigned long long)cputime64_to_clock_t(system), 123 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
123 (unsigned long long)cputime64_to_clock_t(idle), 124 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
124 (unsigned long long)cputime64_to_clock_t(iowait), 125 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
125 (unsigned long long)cputime64_to_clock_t(irq), 126 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
126 (unsigned long long)cputime64_to_clock_t(softirq), 127 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
127 (unsigned long long)cputime64_to_clock_t(steal), 128 seq_putc(p, '\n');
128 (unsigned long long)cputime64_to_clock_t(guest),
129 (unsigned long long)cputime64_to_clock_t(guest_nice));
130 } 129 }
131 seq_printf(p, "intr %llu", (unsigned long long)sum); 130 seq_printf(p, "intr %llu", (unsigned long long)sum);
132 131
133 /* sum again ? it could be updated? */ 132 /* sum again ? it could be updated? */
134 for_each_irq_nr(j) 133 for_each_irq_nr(j)
135 seq_printf(p, " %u", kstat_irqs(j)); 134 seq_put_decimal_ull(p, ' ', kstat_irqs(j));
136 135
137 seq_printf(p, 136 seq_printf(p,
138 "\nctxt %llu\n" 137 "\nctxt %llu\n"
@@ -149,7 +148,7 @@ static int show_stat(struct seq_file *p, void *v)
149 seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq); 148 seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);
150 149
151 for (i = 0; i < NR_SOFTIRQS; i++) 150 for (i = 0; i < NR_SOFTIRQS; i++)
152 seq_printf(p, " %u", per_softirq_sums[i]); 151 seq_put_decimal_ull(p, ' ', per_softirq_sums[i]);
153 seq_putc(p, '\n'); 152 seq_putc(p, '\n');
154 153
155 return 0; 154 return 0;
@@ -157,11 +156,14 @@ static int show_stat(struct seq_file *p, void *v)
157 156
158static int stat_open(struct inode *inode, struct file *file) 157static int stat_open(struct inode *inode, struct file *file)
159{ 158{
160 unsigned size = 4096 * (1 + num_possible_cpus() / 32); 159 unsigned size = 1024 + 128 * num_possible_cpus();
161 char *buf; 160 char *buf;
162 struct seq_file *m; 161 struct seq_file *m;
163 int res; 162 int res;
164 163
 164 /* minimum size to display an interrupt count: 2 bytes */
165 size += 2 * nr_irqs;
166
165 /* don't ask for more than the kmalloc() max size */ 167 /* don't ask for more than the kmalloc() max size */
166 if (size > KMALLOC_MAX_SIZE) 168 if (size > KMALLOC_MAX_SIZE)
167 size = KMALLOC_MAX_SIZE; 169 size = KMALLOC_MAX_SIZE;
@@ -173,7 +175,7 @@ static int stat_open(struct inode *inode, struct file *file)
173 if (!res) { 175 if (!res) {
174 m = file->private_data; 176 m = file->private_data;
175 m->buf = buf; 177 m->buf = buf;
176 m->size = size; 178 m->size = ksize(buf);
177 } else 179 } else
178 kfree(buf); 180 kfree(buf);
179 return res; 181 return res;
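
The effect of the new sizing heuristic is easiest to see with sample numbers (machine parameters invented): for 8 possible CPUs and 256 IRQs, the old formula reserved 4096 * (1 + 8/32) = 4096 bytes with no regard for nr_irqs, while the new one yields 1024 + 128*8 + 2*256 = 2560 bytes. A condensed sketch of the calculation:

static unsigned int stat_buf_estimate(unsigned int cpus, unsigned int irqs)
{
	unsigned int size = 1024 + 128 * cpus;	/* fixed part + per-cpu lines */

	size += 2 * irqs;	/* at least " 0" == 2 bytes per interrupt */
	return min_t(unsigned int, size, KMALLOC_MAX_SIZE);
}

/* stat_buf_estimate(8, 256) == 2560; the old formula gave 4096 */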
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 9ec22d3b4293..82c585f715e3 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -68,9 +68,25 @@ void pstore_set_kmsg_bytes(int bytes)
68/* Tag each group of saved records with a sequence number */ 68/* Tag each group of saved records with a sequence number */
69static int oopscount; 69static int oopscount;
70 70
71static char *reason_str[] = { 71static const char *get_reason_str(enum kmsg_dump_reason reason)
72 "Oops", "Panic", "Kexec", "Restart", "Halt", "Poweroff", "Emergency" 72{
73}; 73 switch (reason) {
74 case KMSG_DUMP_PANIC:
75 return "Panic";
76 case KMSG_DUMP_OOPS:
77 return "Oops";
78 case KMSG_DUMP_EMERG:
79 return "Emergency";
80 case KMSG_DUMP_RESTART:
81 return "Restart";
82 case KMSG_DUMP_HALT:
83 return "Halt";
84 case KMSG_DUMP_POWEROFF:
85 return "Poweroff";
86 default:
87 return "Unknown";
88 }
89}
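
What the switch buys over the removed array is robustness against enum churn; the old pattern (reconstructed below from the deleted lines) was only correct while the array order happened to track the numeric order of enum kmsg_dump_reason:

static const char *reason_str_old[] = {
	"Oops", "Panic", "Kexec", "Restart", "Halt", "Poweroff", "Emergency"
};

/* silently returns the wrong name if the enum is ever reordered */
const char *why_old = (reason < ARRAY_SIZE(reason_str_old))
			? reason_str_old[reason] : "Unknown";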
74 90
75/* 91/*
76 * callback from kmsg_dump. (s2,l2) has the most recently 92 * callback from kmsg_dump. (s2,l2) has the most recently
@@ -85,17 +101,15 @@ static void pstore_dump(struct kmsg_dumper *dumper,
85 unsigned long s1_start, s2_start; 101 unsigned long s1_start, s2_start;
86 unsigned long l1_cpy, l2_cpy; 102 unsigned long l1_cpy, l2_cpy;
87 unsigned long size, total = 0; 103 unsigned long size, total = 0;
88 char *dst, *why; 104 char *dst;
105 const char *why;
89 u64 id; 106 u64 id;
90 int hsize, ret; 107 int hsize, ret;
91 unsigned int part = 1; 108 unsigned int part = 1;
92 unsigned long flags = 0; 109 unsigned long flags = 0;
93 int is_locked = 0; 110 int is_locked = 0;
94 111
95 if (reason < ARRAY_SIZE(reason_str)) 112 why = get_reason_str(reason);
96 why = reason_str[reason];
97 else
98 why = "Unknown";
99 113
100 if (in_nmi()) { 114 if (in_nmi()) {
101 is_locked = spin_trylock(&psinfo->buf_lock); 115 is_locked = spin_trylock(&psinfo->buf_lock);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 8b4f12b33f57..d69a1d1d7e15 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1110,6 +1110,13 @@ static void dquot_decr_space(struct dquot *dquot, qsize_t number)
1110 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 1110 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
1111} 1111}
1112 1112
1113struct dquot_warn {
1114 struct super_block *w_sb;
1115 qid_t w_dq_id;
1116 short w_dq_type;
1117 short w_type;
1118};
1119
1113static int warning_issued(struct dquot *dquot, const int warntype) 1120static int warning_issued(struct dquot *dquot, const int warntype)
1114{ 1121{
1115 int flag = (warntype == QUOTA_NL_BHARDWARN || 1122 int flag = (warntype == QUOTA_NL_BHARDWARN ||
@@ -1125,41 +1132,42 @@ static int warning_issued(struct dquot *dquot, const int warntype)
1125#ifdef CONFIG_PRINT_QUOTA_WARNING 1132#ifdef CONFIG_PRINT_QUOTA_WARNING
1126static int flag_print_warnings = 1; 1133static int flag_print_warnings = 1;
1127 1134
1128static int need_print_warning(struct dquot *dquot) 1135static int need_print_warning(struct dquot_warn *warn)
1129{ 1136{
1130 if (!flag_print_warnings) 1137 if (!flag_print_warnings)
1131 return 0; 1138 return 0;
1132 1139
1133 switch (dquot->dq_type) { 1140 switch (warn->w_dq_type) {
1134 case USRQUOTA: 1141 case USRQUOTA:
1135 return current_fsuid() == dquot->dq_id; 1142 return current_fsuid() == warn->w_dq_id;
1136 case GRPQUOTA: 1143 case GRPQUOTA:
1137 return in_group_p(dquot->dq_id); 1144 return in_group_p(warn->w_dq_id);
1138 } 1145 }
1139 return 0; 1146 return 0;
1140} 1147}
1141 1148
1142/* Print warning to user which exceeded quota */ 1149/* Print warning to user which exceeded quota */
1143static void print_warning(struct dquot *dquot, const int warntype) 1150static void print_warning(struct dquot_warn *warn)
1144{ 1151{
1145 char *msg = NULL; 1152 char *msg = NULL;
1146 struct tty_struct *tty; 1153 struct tty_struct *tty;
1154 int warntype = warn->w_type;
1147 1155
1148 if (warntype == QUOTA_NL_IHARDBELOW || 1156 if (warntype == QUOTA_NL_IHARDBELOW ||
1149 warntype == QUOTA_NL_ISOFTBELOW || 1157 warntype == QUOTA_NL_ISOFTBELOW ||
1150 warntype == QUOTA_NL_BHARDBELOW || 1158 warntype == QUOTA_NL_BHARDBELOW ||
1151 warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot)) 1159 warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(warn))
1152 return; 1160 return;
1153 1161
1154 tty = get_current_tty(); 1162 tty = get_current_tty();
1155 if (!tty) 1163 if (!tty)
1156 return; 1164 return;
1157 tty_write_message(tty, dquot->dq_sb->s_id); 1165 tty_write_message(tty, warn->w_sb->s_id);
1158 if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN) 1166 if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN)
1159 tty_write_message(tty, ": warning, "); 1167 tty_write_message(tty, ": warning, ");
1160 else 1168 else
1161 tty_write_message(tty, ": write failed, "); 1169 tty_write_message(tty, ": write failed, ");
1162 tty_write_message(tty, quotatypes[dquot->dq_type]); 1170 tty_write_message(tty, quotatypes[warn->w_dq_type]);
1163 switch (warntype) { 1171 switch (warntype) {
1164 case QUOTA_NL_IHARDWARN: 1172 case QUOTA_NL_IHARDWARN:
1165 msg = " file limit reached.\r\n"; 1173 msg = " file limit reached.\r\n";
@@ -1185,26 +1193,34 @@ static void print_warning(struct dquot *dquot, const int warntype)
1185} 1193}
1186#endif 1194#endif
1187 1195
1196static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot,
1197 int warntype)
1198{
1199 if (warning_issued(dquot, warntype))
1200 return;
1201 warn->w_type = warntype;
1202 warn->w_sb = dquot->dq_sb;
1203 warn->w_dq_id = dquot->dq_id;
1204 warn->w_dq_type = dquot->dq_type;
1205}
1206
1188/* 1207/*
1189 * Write warnings to the console and send warning messages over netlink. 1208 * Write warnings to the console and send warning messages over netlink.
1190 * 1209 *
1191 * Note that this function can sleep. 1210 * Note that this function can call into tty and networking code.
1192 */ 1211 */
1193static void flush_warnings(struct dquot *const *dquots, char *warntype) 1212static void flush_warnings(struct dquot_warn *warn)
1194{ 1213{
1195 struct dquot *dq;
1196 int i; 1214 int i;
1197 1215
1198 for (i = 0; i < MAXQUOTAS; i++) { 1216 for (i = 0; i < MAXQUOTAS; i++) {
1199 dq = dquots[i]; 1217 if (warn[i].w_type == QUOTA_NL_NOWARN)
1200 if (dq && warntype[i] != QUOTA_NL_NOWARN && 1218 continue;
1201 !warning_issued(dq, warntype[i])) {
1202#ifdef CONFIG_PRINT_QUOTA_WARNING 1219#ifdef CONFIG_PRINT_QUOTA_WARNING
1203 print_warning(dq, warntype[i]); 1220 print_warning(&warn[i]);
1204#endif 1221#endif
1205 quota_send_warning(dq->dq_type, dq->dq_id, 1222 quota_send_warning(warn[i].w_dq_type, warn[i].w_dq_id,
1206 dq->dq_sb->s_dev, warntype[i]); 1223 warn[i].w_sb->s_dev, warn[i].w_type);
1207 }
1208 } 1224 }
1209} 1225}
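
The calling convention, condensed from the call sites converted later in this patch: the warn array is initialized to QUOTA_NL_NOWARN, filled via prepare_warning() while dq_data_lock is held, and flushed only after every lock is dropped, because flush_warnings() may reach tty and netlink code that can sleep.

struct dquot_warn warn[MAXQUOTAS];
int cnt;

for (cnt = 0; cnt < MAXQUOTAS; cnt++)
	warn[cnt].w_type = QUOTA_NL_NOWARN;

spin_lock(&dq_data_lock);
/* check_idq()/check_bdq() record failures through prepare_warning() */
spin_unlock(&dq_data_lock);

flush_warnings(warn);	/* no spinlocks held: free to sleep */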
1210 1226
@@ -1218,11 +1234,11 @@ static int ignore_hardlimit(struct dquot *dquot)
1218} 1234}
1219 1235
1220/* needs dq_data_lock */ 1236/* needs dq_data_lock */
1221static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) 1237static int check_idq(struct dquot *dquot, qsize_t inodes,
1238 struct dquot_warn *warn)
1222{ 1239{
1223 qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes; 1240 qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes;
1224 1241
1225 *warntype = QUOTA_NL_NOWARN;
1226 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || 1242 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
1227 test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1243 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1228 return 0; 1244 return 0;
@@ -1230,7 +1246,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1230 if (dquot->dq_dqb.dqb_ihardlimit && 1246 if (dquot->dq_dqb.dqb_ihardlimit &&
1231 newinodes > dquot->dq_dqb.dqb_ihardlimit && 1247 newinodes > dquot->dq_dqb.dqb_ihardlimit &&
1232 !ignore_hardlimit(dquot)) { 1248 !ignore_hardlimit(dquot)) {
1233 *warntype = QUOTA_NL_IHARDWARN; 1249 prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN);
1234 return -EDQUOT; 1250 return -EDQUOT;
1235 } 1251 }
1236 1252
@@ -1239,14 +1255,14 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1239 dquot->dq_dqb.dqb_itime && 1255 dquot->dq_dqb.dqb_itime &&
1240 get_seconds() >= dquot->dq_dqb.dqb_itime && 1256 get_seconds() >= dquot->dq_dqb.dqb_itime &&
1241 !ignore_hardlimit(dquot)) { 1257 !ignore_hardlimit(dquot)) {
1242 *warntype = QUOTA_NL_ISOFTLONGWARN; 1258 prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN);
1243 return -EDQUOT; 1259 return -EDQUOT;
1244 } 1260 }
1245 1261
1246 if (dquot->dq_dqb.dqb_isoftlimit && 1262 if (dquot->dq_dqb.dqb_isoftlimit &&
1247 newinodes > dquot->dq_dqb.dqb_isoftlimit && 1263 newinodes > dquot->dq_dqb.dqb_isoftlimit &&
1248 dquot->dq_dqb.dqb_itime == 0) { 1264 dquot->dq_dqb.dqb_itime == 0) {
1249 *warntype = QUOTA_NL_ISOFTWARN; 1265 prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN);
1250 dquot->dq_dqb.dqb_itime = get_seconds() + 1266 dquot->dq_dqb.dqb_itime = get_seconds() +
1251 sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; 1267 sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
1252 } 1268 }
@@ -1255,12 +1271,12 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1255} 1271}
1256 1272
1257/* needs dq_data_lock */ 1273/* needs dq_data_lock */
1258static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) 1274static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc,
1275 struct dquot_warn *warn)
1259{ 1276{
1260 qsize_t tspace; 1277 qsize_t tspace;
1261 struct super_block *sb = dquot->dq_sb; 1278 struct super_block *sb = dquot->dq_sb;
1262 1279
1263 *warntype = QUOTA_NL_NOWARN;
1264 if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) || 1280 if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) ||
1265 test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1281 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1266 return 0; 1282 return 0;
@@ -1272,7 +1288,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1272 tspace > dquot->dq_dqb.dqb_bhardlimit && 1288 tspace > dquot->dq_dqb.dqb_bhardlimit &&
1273 !ignore_hardlimit(dquot)) { 1289 !ignore_hardlimit(dquot)) {
1274 if (!prealloc) 1290 if (!prealloc)
1275 *warntype = QUOTA_NL_BHARDWARN; 1291 prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN);
1276 return -EDQUOT; 1292 return -EDQUOT;
1277 } 1293 }
1278 1294
@@ -1282,7 +1298,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1282 get_seconds() >= dquot->dq_dqb.dqb_btime && 1298 get_seconds() >= dquot->dq_dqb.dqb_btime &&
1283 !ignore_hardlimit(dquot)) { 1299 !ignore_hardlimit(dquot)) {
1284 if (!prealloc) 1300 if (!prealloc)
1285 *warntype = QUOTA_NL_BSOFTLONGWARN; 1301 prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN);
1286 return -EDQUOT; 1302 return -EDQUOT;
1287 } 1303 }
1288 1304
@@ -1290,7 +1306,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1290 tspace > dquot->dq_dqb.dqb_bsoftlimit && 1306 tspace > dquot->dq_dqb.dqb_bsoftlimit &&
1291 dquot->dq_dqb.dqb_btime == 0) { 1307 dquot->dq_dqb.dqb_btime == 0) {
1292 if (!prealloc) { 1308 if (!prealloc) {
1293 *warntype = QUOTA_NL_BSOFTWARN; 1309 prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN);
1294 dquot->dq_dqb.dqb_btime = get_seconds() + 1310 dquot->dq_dqb.dqb_btime = get_seconds() +
1295 sb_dqopt(sb)->info[dquot->dq_type].dqi_bgrace; 1311 sb_dqopt(sb)->info[dquot->dq_type].dqi_bgrace;
1296 } 1312 }
@@ -1543,10 +1559,9 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
1543int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) 1559int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1544{ 1560{
1545 int cnt, ret = 0; 1561 int cnt, ret = 0;
1546 char warntype[MAXQUOTAS]; 1562 struct dquot_warn warn[MAXQUOTAS];
1547 int warn = flags & DQUOT_SPACE_WARN; 1563 struct dquot **dquots = inode->i_dquot;
1548 int reserve = flags & DQUOT_SPACE_RESERVE; 1564 int reserve = flags & DQUOT_SPACE_RESERVE;
1549 int nofail = flags & DQUOT_SPACE_NOFAIL;
1550 1565
1551 /* 1566 /*
1552 * First test before acquiring mutex - solves deadlocks when we 1567 * First test before acquiring mutex - solves deadlocks when we
@@ -1559,36 +1574,36 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1559 1574
1560 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1575 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1561 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1576 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1562 warntype[cnt] = QUOTA_NL_NOWARN; 1577 warn[cnt].w_type = QUOTA_NL_NOWARN;
1563 1578
1564 spin_lock(&dq_data_lock); 1579 spin_lock(&dq_data_lock);
1565 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1580 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1566 if (!inode->i_dquot[cnt]) 1581 if (!dquots[cnt])
1567 continue; 1582 continue;
1568 ret = check_bdq(inode->i_dquot[cnt], number, !warn, 1583 ret = check_bdq(dquots[cnt], number,
1569 warntype+cnt); 1584 !(flags & DQUOT_SPACE_WARN), &warn[cnt]);
1570 if (ret && !nofail) { 1585 if (ret && !(flags & DQUOT_SPACE_NOFAIL)) {
1571 spin_unlock(&dq_data_lock); 1586 spin_unlock(&dq_data_lock);
1572 goto out_flush_warn; 1587 goto out_flush_warn;
1573 } 1588 }
1574 } 1589 }
1575 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1590 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1576 if (!inode->i_dquot[cnt]) 1591 if (!dquots[cnt])
1577 continue; 1592 continue;
1578 if (reserve) 1593 if (reserve)
1579 dquot_resv_space(inode->i_dquot[cnt], number); 1594 dquot_resv_space(dquots[cnt], number);
1580 else 1595 else
1581 dquot_incr_space(inode->i_dquot[cnt], number); 1596 dquot_incr_space(dquots[cnt], number);
1582 } 1597 }
1583 inode_incr_space(inode, number, reserve); 1598 inode_incr_space(inode, number, reserve);
1584 spin_unlock(&dq_data_lock); 1599 spin_unlock(&dq_data_lock);
1585 1600
1586 if (reserve) 1601 if (reserve)
1587 goto out_flush_warn; 1602 goto out_flush_warn;
1588 mark_all_dquot_dirty(inode->i_dquot); 1603 mark_all_dquot_dirty(dquots);
1589out_flush_warn: 1604out_flush_warn:
1590 flush_warnings(inode->i_dquot, warntype);
1591 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1605 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1606 flush_warnings(warn);
1592out: 1607out:
1593 return ret; 1608 return ret;
1594} 1609}
@@ -1600,36 +1615,37 @@ EXPORT_SYMBOL(__dquot_alloc_space);
1600int dquot_alloc_inode(const struct inode *inode) 1615int dquot_alloc_inode(const struct inode *inode)
1601{ 1616{
1602 int cnt, ret = 0; 1617 int cnt, ret = 0;
1603 char warntype[MAXQUOTAS]; 1618 struct dquot_warn warn[MAXQUOTAS];
1619 struct dquot * const *dquots = inode->i_dquot;
1604 1620
1605 /* First test before acquiring mutex - solves deadlocks when we 1621 /* First test before acquiring mutex - solves deadlocks when we
1606 * re-enter the quota code and are already holding the mutex */ 1622 * re-enter the quota code and are already holding the mutex */
1607 if (!dquot_active(inode)) 1623 if (!dquot_active(inode))
1608 return 0; 1624 return 0;
1609 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1625 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1610 warntype[cnt] = QUOTA_NL_NOWARN; 1626 warn[cnt].w_type = QUOTA_NL_NOWARN;
1611 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1627 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1612 spin_lock(&dq_data_lock); 1628 spin_lock(&dq_data_lock);
1613 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1629 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1614 if (!inode->i_dquot[cnt]) 1630 if (!dquots[cnt])
1615 continue; 1631 continue;
1616 ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt); 1632 ret = check_idq(dquots[cnt], 1, &warn[cnt]);
1617 if (ret) 1633 if (ret)
1618 goto warn_put_all; 1634 goto warn_put_all;
1619 } 1635 }
1620 1636
1621 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1637 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1622 if (!inode->i_dquot[cnt]) 1638 if (!dquots[cnt])
1623 continue; 1639 continue;
1624 dquot_incr_inodes(inode->i_dquot[cnt], 1); 1640 dquot_incr_inodes(dquots[cnt], 1);
1625 } 1641 }
1626 1642
1627warn_put_all: 1643warn_put_all:
1628 spin_unlock(&dq_data_lock); 1644 spin_unlock(&dq_data_lock);
1629 if (ret == 0) 1645 if (ret == 0)
1630 mark_all_dquot_dirty(inode->i_dquot); 1646 mark_all_dquot_dirty(dquots);
1631 flush_warnings(inode->i_dquot, warntype);
1632 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1647 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1648 flush_warnings(warn);
1633 return ret; 1649 return ret;
1634} 1650}
1635EXPORT_SYMBOL(dquot_alloc_inode); 1651EXPORT_SYMBOL(dquot_alloc_inode);
@@ -1669,7 +1685,8 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
1669void __dquot_free_space(struct inode *inode, qsize_t number, int flags) 1685void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
1670{ 1686{
1671 unsigned int cnt; 1687 unsigned int cnt;
1672 char warntype[MAXQUOTAS]; 1688 struct dquot_warn warn[MAXQUOTAS];
1689 struct dquot **dquots = inode->i_dquot;
1673 int reserve = flags & DQUOT_SPACE_RESERVE; 1690 int reserve = flags & DQUOT_SPACE_RESERVE;
1674 1691
1675 /* First test before acquiring mutex - solves deadlocks when we 1692 /* First test before acquiring mutex - solves deadlocks when we
@@ -1682,23 +1699,28 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
1682 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1699 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1683 spin_lock(&dq_data_lock); 1700 spin_lock(&dq_data_lock);
1684 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1701 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1685 if (!inode->i_dquot[cnt]) 1702 int wtype;
1703
1704 warn[cnt].w_type = QUOTA_NL_NOWARN;
1705 if (!dquots[cnt])
1686 continue; 1706 continue;
1687 warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); 1707 wtype = info_bdq_free(dquots[cnt], number);
1708 if (wtype != QUOTA_NL_NOWARN)
1709 prepare_warning(&warn[cnt], dquots[cnt], wtype);
1688 if (reserve) 1710 if (reserve)
1689 dquot_free_reserved_space(inode->i_dquot[cnt], number); 1711 dquot_free_reserved_space(dquots[cnt], number);
1690 else 1712 else
1691 dquot_decr_space(inode->i_dquot[cnt], number); 1713 dquot_decr_space(dquots[cnt], number);
1692 } 1714 }
1693 inode_decr_space(inode, number, reserve); 1715 inode_decr_space(inode, number, reserve);
1694 spin_unlock(&dq_data_lock); 1716 spin_unlock(&dq_data_lock);
1695 1717
1696 if (reserve) 1718 if (reserve)
1697 goto out_unlock; 1719 goto out_unlock;
1698 mark_all_dquot_dirty(inode->i_dquot); 1720 mark_all_dquot_dirty(dquots);
1699out_unlock: 1721out_unlock:
1700 flush_warnings(inode->i_dquot, warntype);
1701 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1722 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1723 flush_warnings(warn);
1702} 1724}
1703EXPORT_SYMBOL(__dquot_free_space); 1725EXPORT_SYMBOL(__dquot_free_space);
1704 1726
@@ -1708,7 +1730,8 @@ EXPORT_SYMBOL(__dquot_free_space);
1708void dquot_free_inode(const struct inode *inode) 1730void dquot_free_inode(const struct inode *inode)
1709{ 1731{
1710 unsigned int cnt; 1732 unsigned int cnt;
1711 char warntype[MAXQUOTAS]; 1733 struct dquot_warn warn[MAXQUOTAS];
1734 struct dquot * const *dquots = inode->i_dquot;
1712 1735
1713 /* First test before acquiring mutex - solves deadlocks when we 1736 /* First test before acquiring mutex - solves deadlocks when we
1714 * re-enter the quota code and are already holding the mutex */ 1737 * re-enter the quota code and are already holding the mutex */
@@ -1718,15 +1741,20 @@ void dquot_free_inode(const struct inode *inode)
1718 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1741 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1719 spin_lock(&dq_data_lock); 1742 spin_lock(&dq_data_lock);
1720 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1743 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1721 if (!inode->i_dquot[cnt]) 1744 int wtype;
1745
1746 warn[cnt].w_type = QUOTA_NL_NOWARN;
1747 if (!dquots[cnt])
1722 continue; 1748 continue;
1723 warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1); 1749 wtype = info_idq_free(dquots[cnt], 1);
1724 dquot_decr_inodes(inode->i_dquot[cnt], 1); 1750 if (wtype != QUOTA_NL_NOWARN)
1751 prepare_warning(&warn[cnt], dquots[cnt], wtype);
1752 dquot_decr_inodes(dquots[cnt], 1);
1725 } 1753 }
1726 spin_unlock(&dq_data_lock); 1754 spin_unlock(&dq_data_lock);
1727 mark_all_dquot_dirty(inode->i_dquot); 1755 mark_all_dquot_dirty(dquots);
1728 flush_warnings(inode->i_dquot, warntype);
1729 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1756 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1757 flush_warnings(warn);
1730} 1758}
1731EXPORT_SYMBOL(dquot_free_inode); 1759EXPORT_SYMBOL(dquot_free_inode);
1732 1760
@@ -1747,16 +1775,20 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1747 struct dquot *transfer_from[MAXQUOTAS] = {}; 1775 struct dquot *transfer_from[MAXQUOTAS] = {};
1748 int cnt, ret = 0; 1776 int cnt, ret = 0;
1749 char is_valid[MAXQUOTAS] = {}; 1777 char is_valid[MAXQUOTAS] = {};
1750 char warntype_to[MAXQUOTAS]; 1778 struct dquot_warn warn_to[MAXQUOTAS];
1751 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; 1779 struct dquot_warn warn_from_inodes[MAXQUOTAS];
1780 struct dquot_warn warn_from_space[MAXQUOTAS];
1752 1781
1753 /* First test before acquiring mutex - solves deadlocks when we 1782 /* First test before acquiring mutex - solves deadlocks when we
1754 * re-enter the quota code and are already holding the mutex */ 1783 * re-enter the quota code and are already holding the mutex */
1755 if (IS_NOQUOTA(inode)) 1784 if (IS_NOQUOTA(inode))
1756 return 0; 1785 return 0;
1757 /* Initialize the arrays */ 1786 /* Initialize the arrays */
1758 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1787 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1759 warntype_to[cnt] = QUOTA_NL_NOWARN; 1788 warn_to[cnt].w_type = QUOTA_NL_NOWARN;
1789 warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN;
1790 warn_from_space[cnt].w_type = QUOTA_NL_NOWARN;
1791 }
1760 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1792 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1761 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ 1793 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1762 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1794 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1778,10 +1810,10 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1778 continue; 1810 continue;
1779 is_valid[cnt] = 1; 1811 is_valid[cnt] = 1;
1780 transfer_from[cnt] = inode->i_dquot[cnt]; 1812 transfer_from[cnt] = inode->i_dquot[cnt];
1781 ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt); 1813 ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]);
1782 if (ret) 1814 if (ret)
1783 goto over_quota; 1815 goto over_quota;
1784 ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt); 1816 ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]);
1785 if (ret) 1817 if (ret)
1786 goto over_quota; 1818 goto over_quota;
1787 } 1819 }
@@ -1794,10 +1826,15 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1794 continue; 1826 continue;
1795 /* Due to IO error we might not have transfer_from[] structure */ 1827 /* Due to IO error we might not have transfer_from[] structure */
1796 if (transfer_from[cnt]) { 1828 if (transfer_from[cnt]) {
1797 warntype_from_inodes[cnt] = 1829 int wtype;
1798 info_idq_free(transfer_from[cnt], 1); 1830 wtype = info_idq_free(transfer_from[cnt], 1);
1799 warntype_from_space[cnt] = 1831 if (wtype != QUOTA_NL_NOWARN)
1800 info_bdq_free(transfer_from[cnt], space); 1832 prepare_warning(&warn_from_inodes[cnt],
1833 transfer_from[cnt], wtype);
1834 wtype = info_bdq_free(transfer_from[cnt], space);
1835 if (wtype != QUOTA_NL_NOWARN)
1836 prepare_warning(&warn_from_space[cnt],
1837 transfer_from[cnt], wtype);
1801 dquot_decr_inodes(transfer_from[cnt], 1); 1838 dquot_decr_inodes(transfer_from[cnt], 1);
1802 dquot_decr_space(transfer_from[cnt], cur_space); 1839 dquot_decr_space(transfer_from[cnt], cur_space);
1803 dquot_free_reserved_space(transfer_from[cnt], 1840 dquot_free_reserved_space(transfer_from[cnt],
@@ -1815,9 +1852,9 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1815 1852
1816 mark_all_dquot_dirty(transfer_from); 1853 mark_all_dquot_dirty(transfer_from);
1817 mark_all_dquot_dirty(transfer_to); 1854 mark_all_dquot_dirty(transfer_to);
1818 flush_warnings(transfer_to, warntype_to); 1855 flush_warnings(warn_to);
1819 flush_warnings(transfer_from, warntype_from_inodes); 1856 flush_warnings(warn_from_inodes);
1820 flush_warnings(transfer_from, warntype_from_space); 1857 flush_warnings(warn_from_space);
1821 /* Pass back references to put */ 1858 /* Pass back references to put */
1822 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1859 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1823 if (is_valid[cnt]) 1860 if (is_valid[cnt])
@@ -1826,7 +1863,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1826over_quota: 1863over_quota:
1827 spin_unlock(&dq_data_lock); 1864 spin_unlock(&dq_data_lock);
1828 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1865 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1829 flush_warnings(transfer_to, warntype_to); 1866 flush_warnings(warn_to);
1830 return ret; 1867 return ret;
1831} 1868}
1832EXPORT_SYMBOL(__dquot_transfer); 1869EXPORT_SYMBOL(__dquot_transfer);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index fc2c4388d126..9a391204ca27 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -282,10 +282,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
282 case Q_XGETQUOTA: 282 case Q_XGETQUOTA:
283 return quota_getxquota(sb, type, id, addr); 283 return quota_getxquota(sb, type, id, addr);
284 case Q_XQUOTASYNC: 284 case Q_XQUOTASYNC:
285 /* caller already holds s_umount */
286 if (sb->s_flags & MS_RDONLY) 285 if (sb->s_flags & MS_RDONLY)
287 return -EROFS; 286 return -EROFS;
288 writeback_inodes_sb(sb, WB_REASON_SYNC); 287 /* XFS quotas are fully coherent now, making this call a noop */
289 return 0; 288 return 0;
290 default: 289 default:
291 return -EINVAL; 290 return -EINVAL;
diff --git a/fs/read_write.c b/fs/read_write.c
index 5ad4248b0cd8..ffc99d22e0a3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -11,7 +11,7 @@
11#include <linux/uio.h> 11#include <linux/uio.h>
12#include <linux/fsnotify.h> 12#include <linux/fsnotify.h>
13#include <linux/security.h> 13#include <linux/security.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/splice.h> 17#include <linux/splice.h>
diff --git a/fs/readdir.c b/fs/readdir.c
index 356f71528ad6..cc0a8227cddf 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -6,7 +6,7 @@
6 6
7#include <linux/stddef.h> 7#include <linux/stddef.h>
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/errno.h> 12#include <linux/errno.h>
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 445d768eea44..a59d27126338 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -7,6 +7,7 @@
7#include <linux/slab.h> 7#include <linux/slab.h>
8#include <linux/interrupt.h> 8#include <linux/interrupt.h>
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/bug.h>
10#include <linux/workqueue.h> 11#include <linux/workqueue.h>
11#include <asm/unaligned.h> 12#include <asm/unaligned.h>
12#include <linux/bitops.h> 13#include <linux/bitops.h>
diff --git a/fs/select.c b/fs/select.c
index e782258d0de3..6fb8943d580b 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -17,7 +17,7 @@
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/syscalls.h> 19#include <linux/syscalls.h>
20#include <linux/module.h> 20#include <linux/export.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/poll.h> 22#include <linux/poll.h>
23#include <linux/personality.h> /* for STICKY_TIMEOUTS */ 23#include <linux/personality.h> /* for STICKY_TIMEOUTS */
@@ -223,7 +223,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
223 get_file(filp); 223 get_file(filp);
224 entry->filp = filp; 224 entry->filp = filp;
225 entry->wait_address = wait_address; 225 entry->wait_address = wait_address;
226 entry->key = p->key; 226 entry->key = p->_key;
227 init_waitqueue_func_entry(&entry->wait, pollwake); 227 init_waitqueue_func_entry(&entry->wait, pollwake);
228 entry->wait.private = pwq; 228 entry->wait.private = pwq;
229 add_wait_queue(wait_address, &entry->wait); 229 add_wait_queue(wait_address, &entry->wait);
@@ -386,13 +386,11 @@ get_max:
386static inline void wait_key_set(poll_table *wait, unsigned long in, 386static inline void wait_key_set(poll_table *wait, unsigned long in,
387 unsigned long out, unsigned long bit) 387 unsigned long out, unsigned long bit)
388{ 388{
389 if (wait) { 389 wait->_key = POLLEX_SET;
390 wait->key = POLLEX_SET; 390 if (in & bit)
391 if (in & bit) 391 wait->_key |= POLLIN_SET;
392 wait->key |= POLLIN_SET; 392 if (out & bit)
393 if (out & bit) 393 wait->_key |= POLLOUT_SET;
394 wait->key |= POLLOUT_SET;
395 }
396} 394}
397 395
398int do_select(int n, fd_set_bits *fds, struct timespec *end_time) 396int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
@@ -414,7 +412,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
414 poll_initwait(&table); 412 poll_initwait(&table);
415 wait = &table.pt; 413 wait = &table.pt;
416 if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { 414 if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
417 wait = NULL; 415 wait->_qproc = NULL;
418 timed_out = 1; 416 timed_out = 1;
419 } 417 }
420 418
@@ -459,17 +457,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
459 if ((mask & POLLIN_SET) && (in & bit)) { 457 if ((mask & POLLIN_SET) && (in & bit)) {
460 res_in |= bit; 458 res_in |= bit;
461 retval++; 459 retval++;
462 wait = NULL; 460 wait->_qproc = NULL;
463 } 461 }
464 if ((mask & POLLOUT_SET) && (out & bit)) { 462 if ((mask & POLLOUT_SET) && (out & bit)) {
465 res_out |= bit; 463 res_out |= bit;
466 retval++; 464 retval++;
467 wait = NULL; 465 wait->_qproc = NULL;
468 } 466 }
469 if ((mask & POLLEX_SET) && (ex & bit)) { 467 if ((mask & POLLEX_SET) && (ex & bit)) {
470 res_ex |= bit; 468 res_ex |= bit;
471 retval++; 469 retval++;
472 wait = NULL; 470 wait->_qproc = NULL;
473 } 471 }
474 } 472 }
475 } 473 }
@@ -481,7 +479,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
481 *rexp = res_ex; 479 *rexp = res_ex;
482 cond_resched(); 480 cond_resched();
483 } 481 }
484 wait = NULL; 482 wait->_qproc = NULL;
485 if (retval || timed_out || signal_pending(current)) 483 if (retval || timed_out || signal_pending(current))
486 break; 484 break;
487 if (table.error) { 485 if (table.error) {
@@ -720,7 +718,7 @@ struct poll_list {
720 * interested in events matching the pollfd->events mask, and the result 718 * interested in events matching the pollfd->events mask, and the result
721 * matching that mask is both recorded in pollfd->revents and returned. The 719 * matching that mask is both recorded in pollfd->revents and returned. The
722 * pwait poll_table will be used by the fd-provided poll handler for waiting, 720 * pwait poll_table will be used by the fd-provided poll handler for waiting,
723 * if non-NULL. 721 * if pwait->_qproc is non-NULL.
724 */ 722 */
725static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) 723static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
726{ 724{
@@ -738,9 +736,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
738 if (file != NULL) { 736 if (file != NULL) {
739 mask = DEFAULT_POLLMASK; 737 mask = DEFAULT_POLLMASK;
740 if (file->f_op && file->f_op->poll) { 738 if (file->f_op && file->f_op->poll) {
741 if (pwait) 739 pwait->_key = pollfd->events|POLLERR|POLLHUP;
742 pwait->key = pollfd->events |
743 POLLERR | POLLHUP;
744 mask = file->f_op->poll(file, pwait); 740 mask = file->f_op->poll(file, pwait);
745 } 741 }
746 /* Mask out unneeded events. */ 742 /* Mask out unneeded events. */
@@ -763,7 +759,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
763 759
764 /* Optimise the no-wait case */ 760 /* Optimise the no-wait case */
765 if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { 761 if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
766 pt = NULL; 762 pt->_qproc = NULL;
767 timed_out = 1; 763 timed_out = 1;
768 } 764 }
769 765
@@ -781,22 +777,22 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
781 for (; pfd != pfd_end; pfd++) { 777 for (; pfd != pfd_end; pfd++) {
782 /* 778 /*
783 * Fish for events. If we found one, record it 779 * Fish for events. If we found one, record it
784 * and kill the poll_table, so we don't 780 * and kill poll_table->_qproc, so we don't
785 * needlessly register any other waiters after 781 * needlessly register any other waiters after
786 * this. They'll get immediately deregistered 782 * this. They'll get immediately deregistered
787 * when we break out and return. 783 * when we break out and return.
788 */ 784 */
789 if (do_pollfd(pfd, pt)) { 785 if (do_pollfd(pfd, pt)) {
790 count++; 786 count++;
791 pt = NULL; 787 pt->_qproc = NULL;
792 } 788 }
793 } 789 }
794 } 790 }
795 /* 791 /*
796 * All waiters have already been registered, so don't provide 792 * All waiters have already been registered, so don't provide
797 * a poll_table to them on the next loop iteration. 793 * a poll_table->_qproc to them on the next loop iteration.
798 */ 794 */
799 pt = NULL; 795 pt->_qproc = NULL;
800 if (!count) { 796 if (!count) {
801 count = wait->error; 797 count = wait->error;
802 if (signal_pending(current)) 798 if (signal_pending(current))
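
Driver code needs no change for this: poll_wait() already tests the queueing callback before invoking it, so a poll_table whose _qproc has been cleared behaves exactly like the NULL table the old code passed (sketch, foo_* names invented):

static DECLARE_WAIT_QUEUE_HEAD(foo_wq);
static bool foo_ready;

static unsigned int foo_poll(struct file *file, poll_table *wait)
{
	/* a no-op once do_poll()/do_select() has cleared wait->_qproc */
	poll_wait(file, &foo_wq, wait);
	return foo_ready ? (POLLIN | POLLRDNORM) : 0;
}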
diff --git a/fs/seq_file.c b/fs/seq_file.c
index aa242dc99373..0cbd0494b79e 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -6,13 +6,29 @@
6 */ 6 */
7 7
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/seq_file.h> 10#include <linux/seq_file.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12 12
13#include <asm/uaccess.h> 13#include <asm/uaccess.h>
14#include <asm/page.h> 14#include <asm/page.h>
15 15
16
17/*
 18 * seq_files have a buffer which may overflow. When this happens, a larger
19 * buffer is reallocated and all the data will be printed again.
20 * The overflow state is true when m->count == m->size.
21 */
22static bool seq_overflow(struct seq_file *m)
23{
24 return m->count == m->size;
25}
26
27static void seq_set_overflow(struct seq_file *m)
28{
29 m->count = m->size;
30}
31
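
Any helper that writes into m->buf directly is expected to follow the same protocol as the routines below: when the buffer is too small, flag it with seq_set_overflow() and return -1, and the core will rerun the show() callback with a larger buffer. A hypothetical single-byte helper for illustration:

static int seq_put_hex_digit(struct seq_file *m, unsigned int v)
{
	if (m->count >= m->size) {
		seq_set_overflow(m);	/* show() is retried, buffer doubled */
		return -1;
	}
	m->buf[m->count++] = "0123456789abcdef"[v & 0xf];
	return 0;
}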
16/** 32/**
17 * seq_open - initialize sequential file 33 * seq_open - initialize sequential file
18 * @file: file we initialize 34 * @file: file we initialize
@@ -92,7 +108,7 @@ static int traverse(struct seq_file *m, loff_t offset)
92 error = 0; 108 error = 0;
93 m->count = 0; 109 m->count = 0;
94 } 110 }
95 if (m->count == m->size) 111 if (seq_overflow(m))
96 goto Eoverflow; 112 goto Eoverflow;
97 if (pos + m->count > offset) { 113 if (pos + m->count > offset) {
98 m->from = offset - pos; 114 m->from = offset - pos;
@@ -234,7 +250,7 @@ Fill:
234 break; 250 break;
235 } 251 }
236 err = m->op->show(m, p); 252 err = m->op->show(m, p);
237 if (m->count == m->size || err) { 253 if (seq_overflow(m) || err) {
238 m->count = offs; 254 m->count = offs;
239 if (likely(err <= 0)) 255 if (likely(err <= 0))
240 break; 256 break;
@@ -361,7 +377,7 @@ int seq_escape(struct seq_file *m, const char *s, const char *esc)
361 *p++ = '0' + (c & 07); 377 *p++ = '0' + (c & 07);
362 continue; 378 continue;
363 } 379 }
364 m->count = m->size; 380 seq_set_overflow(m);
365 return -1; 381 return -1;
366 } 382 }
367 m->count = p - m->buf; 383 m->count = p - m->buf;
@@ -383,7 +399,7 @@ int seq_printf(struct seq_file *m, const char *f, ...)
383 return 0; 399 return 0;
384 } 400 }
385 } 401 }
386 m->count = m->size; 402 seq_set_overflow(m);
387 return -1; 403 return -1;
388} 404}
389EXPORT_SYMBOL(seq_printf); 405EXPORT_SYMBOL(seq_printf);
@@ -512,7 +528,7 @@ int seq_bitmap(struct seq_file *m, const unsigned long *bits,
512 return 0; 528 return 0;
513 } 529 }
514 } 530 }
515 m->count = m->size; 531 seq_set_overflow(m);
516 return -1; 532 return -1;
517} 533}
518EXPORT_SYMBOL(seq_bitmap); 534EXPORT_SYMBOL(seq_bitmap);
@@ -528,7 +544,7 @@ int seq_bitmap_list(struct seq_file *m, const unsigned long *bits,
528 return 0; 544 return 0;
529 } 545 }
530 } 546 }
531 m->count = m->size; 547 seq_set_overflow(m);
532 return -1; 548 return -1;
533} 549}
534EXPORT_SYMBOL(seq_bitmap_list); 550EXPORT_SYMBOL(seq_bitmap_list);
@@ -639,11 +655,63 @@ int seq_puts(struct seq_file *m, const char *s)
639 m->count += len; 655 m->count += len;
640 return 0; 656 return 0;
641 } 657 }
642 m->count = m->size; 658 seq_set_overflow(m);
643 return -1; 659 return -1;
644} 660}
645EXPORT_SYMBOL(seq_puts); 661EXPORT_SYMBOL(seq_puts);
646 662
663/*
 664 * A helper routine for writing decimal numbers without the rich formatting
 665 * of printf(); only 'unsigned long long' is supported.
 666 * It writes one delimiter byte followed by the number into the seq_file.
 667 * This routine is much faster than seq_printf() when emitting many numbers,
 668 * but in the usual case seq_printf() remains the more readable choice.
669 */
670int seq_put_decimal_ull(struct seq_file *m, char delimiter,
671 unsigned long long num)
672{
673 int len;
674
 675 if (m->count + 2 >= m->size) /* we will write at least 2 bytes */
676 goto overflow;
677
678 if (delimiter)
679 m->buf[m->count++] = delimiter;
680
681 if (num < 10) {
682 m->buf[m->count++] = num + '0';
683 return 0;
684 }
685
686 len = num_to_str(m->buf + m->count, m->size - m->count, num);
687 if (!len)
688 goto overflow;
689 m->count += len;
690 return 0;
691overflow:
692 seq_set_overflow(m);
693 return -1;
694}
695EXPORT_SYMBOL(seq_put_decimal_ull);
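
Usage mirrors the /proc/stat conversion earlier in this patch: write the row prefix with seq_puts()/seq_printf(), then one seq_put_decimal_ull() per value with the separator as the delimiter (sketch, foo_* names invented):

static int foo_show(struct seq_file *m, void *v)
{
	unsigned long long counts[3] = { 7, 42, 9000 };	/* sample data */
	int i;

	seq_puts(m, "foo");
	for (i = 0; i < 3; i++)
		seq_put_decimal_ull(m, ' ', counts[i]);
	seq_putc(m, '\n');	/* emits "foo 7 42 9000\n" */
	return 0;
}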
696
697int seq_put_decimal_ll(struct seq_file *m, char delimiter,
698 long long num)
699{
700 if (num < 0) {
701 if (m->count + 3 >= m->size) {
702 seq_set_overflow(m);
703 return -1;
704 }
705 if (delimiter)
706 m->buf[m->count++] = delimiter;
707 num = -num;
708 delimiter = '-';
709 }
710 return seq_put_decimal_ull(m, delimiter, num);
711
712}
713EXPORT_SYMBOL(seq_put_decimal_ll);
714
647/** 715/**
648 * seq_write - write arbitrary data to buffer 716 * seq_write - write arbitrary data to buffer
649 * @seq: seq_file identifying the buffer to which data should be written 717 * @seq: seq_file identifying the buffer to which data should be written
@@ -659,7 +727,7 @@ int seq_write(struct seq_file *seq, const void *data, size_t len)
659 seq->count += len; 727 seq->count += len;
660 return 0; 728 return 0;
661 } 729 }
662 seq->count = seq->size; 730 seq_set_overflow(seq);
663 return -1; 731 return -1;
664} 732}
665EXPORT_SYMBOL(seq_write); 733EXPORT_SYMBOL(seq_write);
diff --git a/fs/splice.c b/fs/splice.c
index f16402ed915c..5f883de7ef3a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -25,7 +25,7 @@
25#include <linux/mm_inline.h> 25#include <linux/mm_inline.h>
26#include <linux/swap.h> 26#include <linux/swap.h>
27#include <linux/writeback.h> 27#include <linux/writeback.h>
28#include <linux/module.h> 28#include <linux/export.h>
29#include <linux/syscalls.h> 29#include <linux/syscalls.h>
30#include <linux/uio.h> 30#include <linux/uio.h>
31#include <linux/security.h> 31#include <linux/security.h>
diff --git a/fs/stack.c b/fs/stack.c
index 9c11519245a6..5b5388250e29 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -1,4 +1,4 @@
1#include <linux/module.h> 1#include <linux/export.h>
2#include <linux/fs.h> 2#include <linux/fs.h>
3#include <linux/fs_stack.h> 3#include <linux/fs_stack.h>
4 4
diff --git a/fs/stat.c b/fs/stat.c
index 86f13563a463..c733dc5753ae 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7#include <linux/module.h> 7#include <linux/export.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/errno.h> 9#include <linux/errno.h>
10#include <linux/file.h> 10#include <linux/file.h>
diff --git a/fs/statfs.c b/fs/statfs.c
index 2aa6a22e0be2..43e6b6fe4e85 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -1,5 +1,5 @@
1#include <linux/syscalls.h> 1#include <linux/syscalls.h>
2#include <linux/module.h> 2#include <linux/export.h>
3#include <linux/fs.h> 3#include <linux/fs.h>
4#include <linux/file.h> 4#include <linux/file.h>
5#include <linux/mount.h> 5#include <linux/mount.h>
diff --git a/fs/super.c b/fs/super.c
index 7fcb1354c554..cf001775617f 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -20,7 +20,7 @@
20 * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 20 * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
21 */ 21 */
22 22
23#include <linux/module.h> 23#include <linux/export.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/acct.h> 25#include <linux/acct.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
diff --git a/fs/sync.c b/fs/sync.c
index f3501ef39235..0e8db939d96f 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -6,7 +6,7 @@
6#include <linux/file.h> 6#include <linux/file.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/namei.h> 10#include <linux/namei.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/writeback.h> 12#include <linux/writeback.h>
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index f922cbacdb96..1934084e2088 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -36,7 +36,7 @@
36 36
37#ifdef CONFIG_UBIFS_FS_DEBUG 37#ifdef CONFIG_UBIFS_FS_DEBUG
38 38
39DEFINE_SPINLOCK(dbg_lock); 39static DEFINE_SPINLOCK(dbg_lock);
40 40
41static const char *get_key_fmt(int fmt) 41static const char *get_key_fmt(int fmt)
42{ 42{
@@ -221,15 +221,15 @@ const char *dbg_jhead(int jhead)
221 221
222static void dump_ch(const struct ubifs_ch *ch) 222static void dump_ch(const struct ubifs_ch *ch)
223{ 223{
224 printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic)); 224 printk(KERN_ERR "\tmagic %#x\n", le32_to_cpu(ch->magic));
225 printk(KERN_DEBUG "\tcrc %#x\n", le32_to_cpu(ch->crc)); 225 printk(KERN_ERR "\tcrc %#x\n", le32_to_cpu(ch->crc));
226 printk(KERN_DEBUG "\tnode_type %d (%s)\n", ch->node_type, 226 printk(KERN_ERR "\tnode_type %d (%s)\n", ch->node_type,
227 dbg_ntype(ch->node_type)); 227 dbg_ntype(ch->node_type));
228 printk(KERN_DEBUG "\tgroup_type %d (%s)\n", ch->group_type, 228 printk(KERN_ERR "\tgroup_type %d (%s)\n", ch->group_type,
229 dbg_gtype(ch->group_type)); 229 dbg_gtype(ch->group_type));
230 printk(KERN_DEBUG "\tsqnum %llu\n", 230 printk(KERN_ERR "\tsqnum %llu\n",
231 (unsigned long long)le64_to_cpu(ch->sqnum)); 231 (unsigned long long)le64_to_cpu(ch->sqnum));
232 printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len)); 232 printk(KERN_ERR "\tlen %u\n", le32_to_cpu(ch->len));
233} 233}
234 234
235void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode) 235void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode)
@@ -240,43 +240,43 @@ void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode)
240 struct ubifs_dent_node *dent, *pdent = NULL; 240 struct ubifs_dent_node *dent, *pdent = NULL;
241 int count = 2; 241 int count = 2;
242 242
243 printk(KERN_DEBUG "Dump in-memory inode:"); 243 printk(KERN_ERR "Dump in-memory inode:");
244 printk(KERN_DEBUG "\tinode %lu\n", inode->i_ino); 244 printk(KERN_ERR "\tinode %lu\n", inode->i_ino);
245 printk(KERN_DEBUG "\tsize %llu\n", 245 printk(KERN_ERR "\tsize %llu\n",
246 (unsigned long long)i_size_read(inode)); 246 (unsigned long long)i_size_read(inode));
247 printk(KERN_DEBUG "\tnlink %u\n", inode->i_nlink); 247 printk(KERN_ERR "\tnlink %u\n", inode->i_nlink);
248 printk(KERN_DEBUG "\tuid %u\n", (unsigned int)inode->i_uid); 248 printk(KERN_ERR "\tuid %u\n", (unsigned int)inode->i_uid);
249 printk(KERN_DEBUG "\tgid %u\n", (unsigned int)inode->i_gid); 249 printk(KERN_ERR "\tgid %u\n", (unsigned int)inode->i_gid);
250 printk(KERN_DEBUG "\tatime %u.%u\n", 250 printk(KERN_ERR "\tatime %u.%u\n",
251 (unsigned int)inode->i_atime.tv_sec, 251 (unsigned int)inode->i_atime.tv_sec,
252 (unsigned int)inode->i_atime.tv_nsec); 252 (unsigned int)inode->i_atime.tv_nsec);
253 printk(KERN_DEBUG "\tmtime %u.%u\n", 253 printk(KERN_ERR "\tmtime %u.%u\n",
254 (unsigned int)inode->i_mtime.tv_sec, 254 (unsigned int)inode->i_mtime.tv_sec,
255 (unsigned int)inode->i_mtime.tv_nsec); 255 (unsigned int)inode->i_mtime.tv_nsec);
256 printk(KERN_DEBUG "\tctime %u.%u\n", 256 printk(KERN_ERR "\tctime %u.%u\n",
257 (unsigned int)inode->i_ctime.tv_sec, 257 (unsigned int)inode->i_ctime.tv_sec,
258 (unsigned int)inode->i_ctime.tv_nsec); 258 (unsigned int)inode->i_ctime.tv_nsec);
259 printk(KERN_DEBUG "\tcreat_sqnum %llu\n", ui->creat_sqnum); 259 printk(KERN_ERR "\tcreat_sqnum %llu\n", ui->creat_sqnum);
260 printk(KERN_DEBUG "\txattr_size %u\n", ui->xattr_size); 260 printk(KERN_ERR "\txattr_size %u\n", ui->xattr_size);
261 printk(KERN_DEBUG "\txattr_cnt %u\n", ui->xattr_cnt); 261 printk(KERN_ERR "\txattr_cnt %u\n", ui->xattr_cnt);
262 printk(KERN_DEBUG "\txattr_names %u\n", ui->xattr_names); 262 printk(KERN_ERR "\txattr_names %u\n", ui->xattr_names);
263 printk(KERN_DEBUG "\tdirty %u\n", ui->dirty); 263 printk(KERN_ERR "\tdirty %u\n", ui->dirty);
264 printk(KERN_DEBUG "\txattr %u\n", ui->xattr); 264 printk(KERN_ERR "\txattr %u\n", ui->xattr);
265 printk(KERN_DEBUG "\tbulk_read %u\n", ui->xattr); 265 printk(KERN_ERR "\tbulk_read %u\n", ui->xattr);
266 printk(KERN_DEBUG "\tsynced_i_size %llu\n", 266 printk(KERN_ERR "\tsynced_i_size %llu\n",
267 (unsigned long long)ui->synced_i_size); 267 (unsigned long long)ui->synced_i_size);
268 printk(KERN_DEBUG "\tui_size %llu\n", 268 printk(KERN_ERR "\tui_size %llu\n",
269 (unsigned long long)ui->ui_size); 269 (unsigned long long)ui->ui_size);
270 printk(KERN_DEBUG "\tflags %d\n", ui->flags); 270 printk(KERN_ERR "\tflags %d\n", ui->flags);
271 printk(KERN_DEBUG "\tcompr_type %d\n", ui->compr_type); 271 printk(KERN_ERR "\tcompr_type %d\n", ui->compr_type);
272 printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read); 272 printk(KERN_ERR "\tlast_page_read %lu\n", ui->last_page_read);
273 printk(KERN_DEBUG "\tread_in_a_row %lu\n", ui->read_in_a_row); 273 printk(KERN_ERR "\tread_in_a_row %lu\n", ui->read_in_a_row);
274 printk(KERN_DEBUG "\tdata_len %d\n", ui->data_len); 274 printk(KERN_ERR "\tdata_len %d\n", ui->data_len);
275 275
276 if (!S_ISDIR(inode->i_mode)) 276 if (!S_ISDIR(inode->i_mode))
277 return; 277 return;
278 278
279 printk(KERN_DEBUG "List of directory entries:\n"); 279 printk(KERN_ERR "List of directory entries:\n");
280 ubifs_assert(!mutex_is_locked(&c->tnc_mutex)); 280 ubifs_assert(!mutex_is_locked(&c->tnc_mutex));
281 281
282 lowest_dent_key(c, &key, inode->i_ino); 282 lowest_dent_key(c, &key, inode->i_ino);
@@ -284,11 +284,11 @@ void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode)
284 dent = ubifs_tnc_next_ent(c, &key, &nm); 284 dent = ubifs_tnc_next_ent(c, &key, &nm);
285 if (IS_ERR(dent)) { 285 if (IS_ERR(dent)) {
286 if (PTR_ERR(dent) != -ENOENT) 286 if (PTR_ERR(dent) != -ENOENT)
287 printk(KERN_DEBUG "error %ld\n", PTR_ERR(dent)); 287 printk(KERN_ERR "error %ld\n", PTR_ERR(dent));
288 break; 288 break;
289 } 289 }
290 290
291 printk(KERN_DEBUG "\t%d: %s (%s)\n", 291 printk(KERN_ERR "\t%d: %s (%s)\n",
292 count++, dent->name, get_dent_type(dent->type)); 292 count++, dent->name, get_dent_type(dent->type));
293 293
294 nm.name = dent->name; 294 nm.name = dent->name;
@@ -312,8 +312,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
312 312
313 /* If the magic is incorrect, just hexdump the first bytes */ 313 /* If the magic is incorrect, just hexdump the first bytes */
314 if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) { 314 if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) {
315 printk(KERN_DEBUG "Not a node, first %zu bytes:", UBIFS_CH_SZ); 315 printk(KERN_ERR "Not a node, first %zu bytes:", UBIFS_CH_SZ);
316 print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, 316 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 32, 1,
317 (void *)node, UBIFS_CH_SZ, 1); 317 (void *)node, UBIFS_CH_SZ, 1);
318 return; 318 return;
319 } 319 }
@@ -326,7 +326,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
326 { 326 {
327 const struct ubifs_pad_node *pad = node; 327 const struct ubifs_pad_node *pad = node;
328 328
329 printk(KERN_DEBUG "\tpad_len %u\n", 329 printk(KERN_ERR "\tpad_len %u\n",
330 le32_to_cpu(pad->pad_len)); 330 le32_to_cpu(pad->pad_len));
331 break; 331 break;
332 } 332 }
@@ -335,50 +335,50 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
335 const struct ubifs_sb_node *sup = node; 335 const struct ubifs_sb_node *sup = node;
336 unsigned int sup_flags = le32_to_cpu(sup->flags); 336 unsigned int sup_flags = le32_to_cpu(sup->flags);
337 337
338 printk(KERN_DEBUG "\tkey_hash %d (%s)\n", 338 printk(KERN_ERR "\tkey_hash %d (%s)\n",
339 (int)sup->key_hash, get_key_hash(sup->key_hash)); 339 (int)sup->key_hash, get_key_hash(sup->key_hash));
340 printk(KERN_DEBUG "\tkey_fmt %d (%s)\n", 340 printk(KERN_ERR "\tkey_fmt %d (%s)\n",
341 (int)sup->key_fmt, get_key_fmt(sup->key_fmt)); 341 (int)sup->key_fmt, get_key_fmt(sup->key_fmt));
342 printk(KERN_DEBUG "\tflags %#x\n", sup_flags); 342 printk(KERN_ERR "\tflags %#x\n", sup_flags);
343 printk(KERN_DEBUG "\t big_lpt %u\n", 343 printk(KERN_ERR "\t big_lpt %u\n",
344 !!(sup_flags & UBIFS_FLG_BIGLPT)); 344 !!(sup_flags & UBIFS_FLG_BIGLPT));
345 printk(KERN_DEBUG "\t space_fixup %u\n", 345 printk(KERN_ERR "\t space_fixup %u\n",
346 !!(sup_flags & UBIFS_FLG_SPACE_FIXUP)); 346 !!(sup_flags & UBIFS_FLG_SPACE_FIXUP));
347 printk(KERN_DEBUG "\tmin_io_size %u\n", 347 printk(KERN_ERR "\tmin_io_size %u\n",
348 le32_to_cpu(sup->min_io_size)); 348 le32_to_cpu(sup->min_io_size));
349 printk(KERN_DEBUG "\tleb_size %u\n", 349 printk(KERN_ERR "\tleb_size %u\n",
350 le32_to_cpu(sup->leb_size)); 350 le32_to_cpu(sup->leb_size));
351 printk(KERN_DEBUG "\tleb_cnt %u\n", 351 printk(KERN_ERR "\tleb_cnt %u\n",
352 le32_to_cpu(sup->leb_cnt)); 352 le32_to_cpu(sup->leb_cnt));
353 printk(KERN_DEBUG "\tmax_leb_cnt %u\n", 353 printk(KERN_ERR "\tmax_leb_cnt %u\n",
354 le32_to_cpu(sup->max_leb_cnt)); 354 le32_to_cpu(sup->max_leb_cnt));
355 printk(KERN_DEBUG "\tmax_bud_bytes %llu\n", 355 printk(KERN_ERR "\tmax_bud_bytes %llu\n",
356 (unsigned long long)le64_to_cpu(sup->max_bud_bytes)); 356 (unsigned long long)le64_to_cpu(sup->max_bud_bytes));
357 printk(KERN_DEBUG "\tlog_lebs %u\n", 357 printk(KERN_ERR "\tlog_lebs %u\n",
358 le32_to_cpu(sup->log_lebs)); 358 le32_to_cpu(sup->log_lebs));
359 printk(KERN_DEBUG "\tlpt_lebs %u\n", 359 printk(KERN_ERR "\tlpt_lebs %u\n",
360 le32_to_cpu(sup->lpt_lebs)); 360 le32_to_cpu(sup->lpt_lebs));
361 printk(KERN_DEBUG "\torph_lebs %u\n", 361 printk(KERN_ERR "\torph_lebs %u\n",
362 le32_to_cpu(sup->orph_lebs)); 362 le32_to_cpu(sup->orph_lebs));
363 printk(KERN_DEBUG "\tjhead_cnt %u\n", 363 printk(KERN_ERR "\tjhead_cnt %u\n",
364 le32_to_cpu(sup->jhead_cnt)); 364 le32_to_cpu(sup->jhead_cnt));
365 printk(KERN_DEBUG "\tfanout %u\n", 365 printk(KERN_ERR "\tfanout %u\n",
366 le32_to_cpu(sup->fanout)); 366 le32_to_cpu(sup->fanout));
367 printk(KERN_DEBUG "\tlsave_cnt %u\n", 367 printk(KERN_ERR "\tlsave_cnt %u\n",
368 le32_to_cpu(sup->lsave_cnt)); 368 le32_to_cpu(sup->lsave_cnt));
369 printk(KERN_DEBUG "\tdefault_compr %u\n", 369 printk(KERN_ERR "\tdefault_compr %u\n",
370 (int)le16_to_cpu(sup->default_compr)); 370 (int)le16_to_cpu(sup->default_compr));
371 printk(KERN_DEBUG "\trp_size %llu\n", 371 printk(KERN_ERR "\trp_size %llu\n",
372 (unsigned long long)le64_to_cpu(sup->rp_size)); 372 (unsigned long long)le64_to_cpu(sup->rp_size));
373 printk(KERN_DEBUG "\trp_uid %u\n", 373 printk(KERN_ERR "\trp_uid %u\n",
374 le32_to_cpu(sup->rp_uid)); 374 le32_to_cpu(sup->rp_uid));
375 printk(KERN_DEBUG "\trp_gid %u\n", 375 printk(KERN_ERR "\trp_gid %u\n",
376 le32_to_cpu(sup->rp_gid)); 376 le32_to_cpu(sup->rp_gid));
377 printk(KERN_DEBUG "\tfmt_version %u\n", 377 printk(KERN_ERR "\tfmt_version %u\n",
378 le32_to_cpu(sup->fmt_version)); 378 le32_to_cpu(sup->fmt_version));
379 printk(KERN_DEBUG "\ttime_gran %u\n", 379 printk(KERN_ERR "\ttime_gran %u\n",
380 le32_to_cpu(sup->time_gran)); 380 le32_to_cpu(sup->time_gran));
381 printk(KERN_DEBUG "\tUUID %pUB\n", 381 printk(KERN_ERR "\tUUID %pUB\n",
382 sup->uuid); 382 sup->uuid);
383 break; 383 break;
384 } 384 }
@@ -386,61 +386,61 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
386 { 386 {
387 const struct ubifs_mst_node *mst = node; 387 const struct ubifs_mst_node *mst = node;
388 388
389 printk(KERN_DEBUG "\thighest_inum %llu\n", 389 printk(KERN_ERR "\thighest_inum %llu\n",
390 (unsigned long long)le64_to_cpu(mst->highest_inum)); 390 (unsigned long long)le64_to_cpu(mst->highest_inum));
391 printk(KERN_DEBUG "\tcommit number %llu\n", 391 printk(KERN_ERR "\tcommit number %llu\n",
392 (unsigned long long)le64_to_cpu(mst->cmt_no)); 392 (unsigned long long)le64_to_cpu(mst->cmt_no));
393 printk(KERN_DEBUG "\tflags %#x\n", 393 printk(KERN_ERR "\tflags %#x\n",
394 le32_to_cpu(mst->flags)); 394 le32_to_cpu(mst->flags));
395 printk(KERN_DEBUG "\tlog_lnum %u\n", 395 printk(KERN_ERR "\tlog_lnum %u\n",
396 le32_to_cpu(mst->log_lnum)); 396 le32_to_cpu(mst->log_lnum));
397 printk(KERN_DEBUG "\troot_lnum %u\n", 397 printk(KERN_ERR "\troot_lnum %u\n",
398 le32_to_cpu(mst->root_lnum)); 398 le32_to_cpu(mst->root_lnum));
399 printk(KERN_DEBUG "\troot_offs %u\n", 399 printk(KERN_ERR "\troot_offs %u\n",
400 le32_to_cpu(mst->root_offs)); 400 le32_to_cpu(mst->root_offs));
401 printk(KERN_DEBUG "\troot_len %u\n", 401 printk(KERN_ERR "\troot_len %u\n",
402 le32_to_cpu(mst->root_len)); 402 le32_to_cpu(mst->root_len));
403 printk(KERN_DEBUG "\tgc_lnum %u\n", 403 printk(KERN_ERR "\tgc_lnum %u\n",
404 le32_to_cpu(mst->gc_lnum)); 404 le32_to_cpu(mst->gc_lnum));
405 printk(KERN_DEBUG "\tihead_lnum %u\n", 405 printk(KERN_ERR "\tihead_lnum %u\n",
406 le32_to_cpu(mst->ihead_lnum)); 406 le32_to_cpu(mst->ihead_lnum));
407 printk(KERN_DEBUG "\tihead_offs %u\n", 407 printk(KERN_ERR "\tihead_offs %u\n",
408 le32_to_cpu(mst->ihead_offs)); 408 le32_to_cpu(mst->ihead_offs));
409 printk(KERN_DEBUG "\tindex_size %llu\n", 409 printk(KERN_ERR "\tindex_size %llu\n",
410 (unsigned long long)le64_to_cpu(mst->index_size)); 410 (unsigned long long)le64_to_cpu(mst->index_size));
411 printk(KERN_DEBUG "\tlpt_lnum %u\n", 411 printk(KERN_ERR "\tlpt_lnum %u\n",
412 le32_to_cpu(mst->lpt_lnum)); 412 le32_to_cpu(mst->lpt_lnum));
413 printk(KERN_DEBUG "\tlpt_offs %u\n", 413 printk(KERN_ERR "\tlpt_offs %u\n",
414 le32_to_cpu(mst->lpt_offs)); 414 le32_to_cpu(mst->lpt_offs));
415 printk(KERN_DEBUG "\tnhead_lnum %u\n", 415 printk(KERN_ERR "\tnhead_lnum %u\n",
416 le32_to_cpu(mst->nhead_lnum)); 416 le32_to_cpu(mst->nhead_lnum));
417 printk(KERN_DEBUG "\tnhead_offs %u\n", 417 printk(KERN_ERR "\tnhead_offs %u\n",
418 le32_to_cpu(mst->nhead_offs)); 418 le32_to_cpu(mst->nhead_offs));
419 printk(KERN_DEBUG "\tltab_lnum %u\n", 419 printk(KERN_ERR "\tltab_lnum %u\n",
420 le32_to_cpu(mst->ltab_lnum)); 420 le32_to_cpu(mst->ltab_lnum));
421 printk(KERN_DEBUG "\tltab_offs %u\n", 421 printk(KERN_ERR "\tltab_offs %u\n",
422 le32_to_cpu(mst->ltab_offs)); 422 le32_to_cpu(mst->ltab_offs));
423 printk(KERN_DEBUG "\tlsave_lnum %u\n", 423 printk(KERN_ERR "\tlsave_lnum %u\n",
424 le32_to_cpu(mst->lsave_lnum)); 424 le32_to_cpu(mst->lsave_lnum));
425 printk(KERN_DEBUG "\tlsave_offs %u\n", 425 printk(KERN_ERR "\tlsave_offs %u\n",
426 le32_to_cpu(mst->lsave_offs)); 426 le32_to_cpu(mst->lsave_offs));
427 printk(KERN_DEBUG "\tlscan_lnum %u\n", 427 printk(KERN_ERR "\tlscan_lnum %u\n",
428 le32_to_cpu(mst->lscan_lnum)); 428 le32_to_cpu(mst->lscan_lnum));
429 printk(KERN_DEBUG "\tleb_cnt %u\n", 429 printk(KERN_ERR "\tleb_cnt %u\n",
430 le32_to_cpu(mst->leb_cnt)); 430 le32_to_cpu(mst->leb_cnt));
431 printk(KERN_DEBUG "\tempty_lebs %u\n", 431 printk(KERN_ERR "\tempty_lebs %u\n",
432 le32_to_cpu(mst->empty_lebs)); 432 le32_to_cpu(mst->empty_lebs));
433 printk(KERN_DEBUG "\tidx_lebs %u\n", 433 printk(KERN_ERR "\tidx_lebs %u\n",
434 le32_to_cpu(mst->idx_lebs)); 434 le32_to_cpu(mst->idx_lebs));
435 printk(KERN_DEBUG "\ttotal_free %llu\n", 435 printk(KERN_ERR "\ttotal_free %llu\n",
436 (unsigned long long)le64_to_cpu(mst->total_free)); 436 (unsigned long long)le64_to_cpu(mst->total_free));
437 printk(KERN_DEBUG "\ttotal_dirty %llu\n", 437 printk(KERN_ERR "\ttotal_dirty %llu\n",
438 (unsigned long long)le64_to_cpu(mst->total_dirty)); 438 (unsigned long long)le64_to_cpu(mst->total_dirty));
439 printk(KERN_DEBUG "\ttotal_used %llu\n", 439 printk(KERN_ERR "\ttotal_used %llu\n",
440 (unsigned long long)le64_to_cpu(mst->total_used)); 440 (unsigned long long)le64_to_cpu(mst->total_used));
441 printk(KERN_DEBUG "\ttotal_dead %llu\n", 441 printk(KERN_ERR "\ttotal_dead %llu\n",
442 (unsigned long long)le64_to_cpu(mst->total_dead)); 442 (unsigned long long)le64_to_cpu(mst->total_dead));
443 printk(KERN_DEBUG "\ttotal_dark %llu\n", 443 printk(KERN_ERR "\ttotal_dark %llu\n",
444 (unsigned long long)le64_to_cpu(mst->total_dark)); 444 (unsigned long long)le64_to_cpu(mst->total_dark));
445 break; 445 break;
446 } 446 }
@@ -448,11 +448,11 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
448 { 448 {
449 const struct ubifs_ref_node *ref = node; 449 const struct ubifs_ref_node *ref = node;
450 450
451 printk(KERN_DEBUG "\tlnum %u\n", 451 printk(KERN_ERR "\tlnum %u\n",
452 le32_to_cpu(ref->lnum)); 452 le32_to_cpu(ref->lnum));
453 printk(KERN_DEBUG "\toffs %u\n", 453 printk(KERN_ERR "\toffs %u\n",
454 le32_to_cpu(ref->offs)); 454 le32_to_cpu(ref->offs));
455 printk(KERN_DEBUG "\tjhead %u\n", 455 printk(KERN_ERR "\tjhead %u\n",
456 le32_to_cpu(ref->jhead)); 456 le32_to_cpu(ref->jhead));
457 break; 457 break;
458 } 458 }
@@ -461,40 +461,40 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
461 const struct ubifs_ino_node *ino = node; 461 const struct ubifs_ino_node *ino = node;
462 462
463 key_read(c, &ino->key, &key); 463 key_read(c, &ino->key, &key);
464 printk(KERN_DEBUG "\tkey %s\n", 464 printk(KERN_ERR "\tkey %s\n",
465 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); 465 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
466 printk(KERN_DEBUG "\tcreat_sqnum %llu\n", 466 printk(KERN_ERR "\tcreat_sqnum %llu\n",
467 (unsigned long long)le64_to_cpu(ino->creat_sqnum)); 467 (unsigned long long)le64_to_cpu(ino->creat_sqnum));
468 printk(KERN_DEBUG "\tsize %llu\n", 468 printk(KERN_ERR "\tsize %llu\n",
469 (unsigned long long)le64_to_cpu(ino->size)); 469 (unsigned long long)le64_to_cpu(ino->size));
470 printk(KERN_DEBUG "\tnlink %u\n", 470 printk(KERN_ERR "\tnlink %u\n",
471 le32_to_cpu(ino->nlink)); 471 le32_to_cpu(ino->nlink));
472 printk(KERN_DEBUG "\tatime %lld.%u\n", 472 printk(KERN_ERR "\tatime %lld.%u\n",
473 (long long)le64_to_cpu(ino->atime_sec), 473 (long long)le64_to_cpu(ino->atime_sec),
474 le32_to_cpu(ino->atime_nsec)); 474 le32_to_cpu(ino->atime_nsec));
475 printk(KERN_DEBUG "\tmtime %lld.%u\n", 475 printk(KERN_ERR "\tmtime %lld.%u\n",
476 (long long)le64_to_cpu(ino->mtime_sec), 476 (long long)le64_to_cpu(ino->mtime_sec),
477 le32_to_cpu(ino->mtime_nsec)); 477 le32_to_cpu(ino->mtime_nsec));
478 printk(KERN_DEBUG "\tctime %lld.%u\n", 478 printk(KERN_ERR "\tctime %lld.%u\n",
479 (long long)le64_to_cpu(ino->ctime_sec), 479 (long long)le64_to_cpu(ino->ctime_sec),
480 le32_to_cpu(ino->ctime_nsec)); 480 le32_to_cpu(ino->ctime_nsec));
481 printk(KERN_DEBUG "\tuid %u\n", 481 printk(KERN_ERR "\tuid %u\n",
482 le32_to_cpu(ino->uid)); 482 le32_to_cpu(ino->uid));
483 printk(KERN_DEBUG "\tgid %u\n", 483 printk(KERN_ERR "\tgid %u\n",
484 le32_to_cpu(ino->gid)); 484 le32_to_cpu(ino->gid));
485 printk(KERN_DEBUG "\tmode %u\n", 485 printk(KERN_ERR "\tmode %u\n",
486 le32_to_cpu(ino->mode)); 486 le32_to_cpu(ino->mode));
487 printk(KERN_DEBUG "\tflags %#x\n", 487 printk(KERN_ERR "\tflags %#x\n",
488 le32_to_cpu(ino->flags)); 488 le32_to_cpu(ino->flags));
489 printk(KERN_DEBUG "\txattr_cnt %u\n", 489 printk(KERN_ERR "\txattr_cnt %u\n",
490 le32_to_cpu(ino->xattr_cnt)); 490 le32_to_cpu(ino->xattr_cnt));
491 printk(KERN_DEBUG "\txattr_size %u\n", 491 printk(KERN_ERR "\txattr_size %u\n",
492 le32_to_cpu(ino->xattr_size)); 492 le32_to_cpu(ino->xattr_size));
493 printk(KERN_DEBUG "\txattr_names %u\n", 493 printk(KERN_ERR "\txattr_names %u\n",
494 le32_to_cpu(ino->xattr_names)); 494 le32_to_cpu(ino->xattr_names));
495 printk(KERN_DEBUG "\tcompr_type %#x\n", 495 printk(KERN_ERR "\tcompr_type %#x\n",
496 (int)le16_to_cpu(ino->compr_type)); 496 (int)le16_to_cpu(ino->compr_type));
497 printk(KERN_DEBUG "\tdata len %u\n", 497 printk(KERN_ERR "\tdata len %u\n",
498 le32_to_cpu(ino->data_len)); 498 le32_to_cpu(ino->data_len));
499 break; 499 break;
500 } 500 }
@@ -505,16 +505,16 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
505 int nlen = le16_to_cpu(dent->nlen); 505 int nlen = le16_to_cpu(dent->nlen);
506 506
507 key_read(c, &dent->key, &key); 507 key_read(c, &dent->key, &key);
508 printk(KERN_DEBUG "\tkey %s\n", 508 printk(KERN_ERR "\tkey %s\n",
509 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); 509 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
510 printk(KERN_DEBUG "\tinum %llu\n", 510 printk(KERN_ERR "\tinum %llu\n",
511 (unsigned long long)le64_to_cpu(dent->inum)); 511 (unsigned long long)le64_to_cpu(dent->inum));
512 printk(KERN_DEBUG "\ttype %d\n", (int)dent->type); 512 printk(KERN_ERR "\ttype %d\n", (int)dent->type);
513 printk(KERN_DEBUG "\tnlen %d\n", nlen); 513 printk(KERN_ERR "\tnlen %d\n", nlen);
514 printk(KERN_DEBUG "\tname "); 514 printk(KERN_ERR "\tname ");
515 515
516 if (nlen > UBIFS_MAX_NLEN) 516 if (nlen > UBIFS_MAX_NLEN)
517 printk(KERN_DEBUG "(bad name length, not printing, " 517 printk(KERN_ERR "(bad name length, not printing, "
518 "bad or corrupted node)"); 518 "bad or corrupted node)");
519 else { 519 else {
520 for (i = 0; i < nlen && dent->name[i]; i++) 520 for (i = 0; i < nlen && dent->name[i]; i++)
@@ -530,16 +530,16 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
530 int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; 530 int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ;
531 531
532 key_read(c, &dn->key, &key); 532 key_read(c, &dn->key, &key);
533 printk(KERN_DEBUG "\tkey %s\n", 533 printk(KERN_ERR "\tkey %s\n",
534 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); 534 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
535 printk(KERN_DEBUG "\tsize %u\n", 535 printk(KERN_ERR "\tsize %u\n",
536 le32_to_cpu(dn->size)); 536 le32_to_cpu(dn->size));
537 printk(KERN_DEBUG "\tcompr_typ %d\n", 537 printk(KERN_ERR "\tcompr_typ %d\n",
538 (int)le16_to_cpu(dn->compr_type)); 538 (int)le16_to_cpu(dn->compr_type));
539 printk(KERN_DEBUG "\tdata size %d\n", 539 printk(KERN_ERR "\tdata size %d\n",
540 dlen); 540 dlen);
541 printk(KERN_DEBUG "\tdata:\n"); 541 printk(KERN_ERR "\tdata:\n");
542 print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET, 32, 1, 542 print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1,
543 (void *)&dn->data, dlen, 0); 543 (void *)&dn->data, dlen, 0);
544 break; 544 break;
545 } 545 }
@@ -547,11 +547,11 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
547 { 547 {
548 const struct ubifs_trun_node *trun = node; 548 const struct ubifs_trun_node *trun = node;
549 549
550 printk(KERN_DEBUG "\tinum %u\n", 550 printk(KERN_ERR "\tinum %u\n",
551 le32_to_cpu(trun->inum)); 551 le32_to_cpu(trun->inum));
552 printk(KERN_DEBUG "\told_size %llu\n", 552 printk(KERN_ERR "\told_size %llu\n",
553 (unsigned long long)le64_to_cpu(trun->old_size)); 553 (unsigned long long)le64_to_cpu(trun->old_size));
554 printk(KERN_DEBUG "\tnew_size %llu\n", 554 printk(KERN_ERR "\tnew_size %llu\n",
555 (unsigned long long)le64_to_cpu(trun->new_size)); 555 (unsigned long long)le64_to_cpu(trun->new_size));
556 break; 556 break;
557 } 557 }
@@ -560,17 +560,17 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
560 const struct ubifs_idx_node *idx = node; 560 const struct ubifs_idx_node *idx = node;
561 561
562 n = le16_to_cpu(idx->child_cnt); 562 n = le16_to_cpu(idx->child_cnt);
563 printk(KERN_DEBUG "\tchild_cnt %d\n", n); 563 printk(KERN_ERR "\tchild_cnt %d\n", n);
564 printk(KERN_DEBUG "\tlevel %d\n", 564 printk(KERN_ERR "\tlevel %d\n",
565 (int)le16_to_cpu(idx->level)); 565 (int)le16_to_cpu(idx->level));
566 printk(KERN_DEBUG "\tBranches:\n"); 566 printk(KERN_ERR "\tBranches:\n");
567 567
568 for (i = 0; i < n && i < c->fanout - 1; i++) { 568 for (i = 0; i < n && i < c->fanout - 1; i++) {
569 const struct ubifs_branch *br; 569 const struct ubifs_branch *br;
570 570
571 br = ubifs_idx_branch(c, idx, i); 571 br = ubifs_idx_branch(c, idx, i);
572 key_read(c, &br->key, &key); 572 key_read(c, &br->key, &key);
573 printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n", 573 printk(KERN_ERR "\t%d: LEB %d:%d len %d key %s\n",
574 i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), 574 i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs),
575 le32_to_cpu(br->len), 575 le32_to_cpu(br->len),
576 dbg_snprintf_key(c, &key, key_buf, 576 dbg_snprintf_key(c, &key, key_buf,
@@ -584,20 +584,20 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
584 { 584 {
585 const struct ubifs_orph_node *orph = node; 585 const struct ubifs_orph_node *orph = node;
586 586
587 printk(KERN_DEBUG "\tcommit number %llu\n", 587 printk(KERN_ERR "\tcommit number %llu\n",
588 (unsigned long long) 588 (unsigned long long)
589 le64_to_cpu(orph->cmt_no) & LLONG_MAX); 589 le64_to_cpu(orph->cmt_no) & LLONG_MAX);
590 printk(KERN_DEBUG "\tlast node flag %llu\n", 590 printk(KERN_ERR "\tlast node flag %llu\n",
591 (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63); 591 (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63);
592 n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3; 592 n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3;
593 printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); 593 printk(KERN_ERR "\t%d orphan inode numbers:\n", n);
594 for (i = 0; i < n; i++) 594 for (i = 0; i < n; i++)
595 printk(KERN_DEBUG "\t ino %llu\n", 595 printk(KERN_ERR "\t ino %llu\n",
596 (unsigned long long)le64_to_cpu(orph->inos[i])); 596 (unsigned long long)le64_to_cpu(orph->inos[i]));
597 break; 597 break;
598 } 598 }
599 default: 599 default:
600 printk(KERN_DEBUG "node type %d was not recognized\n", 600 printk(KERN_ERR "node type %d was not recognized\n",
601 (int)ch->node_type); 601 (int)ch->node_type);
602 } 602 }
603 spin_unlock(&dbg_lock); 603 spin_unlock(&dbg_lock);
@@ -606,16 +606,16 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
606void dbg_dump_budget_req(const struct ubifs_budget_req *req) 606void dbg_dump_budget_req(const struct ubifs_budget_req *req)
607{ 607{
608 spin_lock(&dbg_lock); 608 spin_lock(&dbg_lock);
609 printk(KERN_DEBUG "Budgeting request: new_ino %d, dirtied_ino %d\n", 609 printk(KERN_ERR "Budgeting request: new_ino %d, dirtied_ino %d\n",
610 req->new_ino, req->dirtied_ino); 610 req->new_ino, req->dirtied_ino);
611 printk(KERN_DEBUG "\tnew_ino_d %d, dirtied_ino_d %d\n", 611 printk(KERN_ERR "\tnew_ino_d %d, dirtied_ino_d %d\n",
612 req->new_ino_d, req->dirtied_ino_d); 612 req->new_ino_d, req->dirtied_ino_d);
613 printk(KERN_DEBUG "\tnew_page %d, dirtied_page %d\n", 613 printk(KERN_ERR "\tnew_page %d, dirtied_page %d\n",
614 req->new_page, req->dirtied_page); 614 req->new_page, req->dirtied_page);
615 printk(KERN_DEBUG "\tnew_dent %d, mod_dent %d\n", 615 printk(KERN_ERR "\tnew_dent %d, mod_dent %d\n",
616 req->new_dent, req->mod_dent); 616 req->new_dent, req->mod_dent);
617 printk(KERN_DEBUG "\tidx_growth %d\n", req->idx_growth); 617 printk(KERN_ERR "\tidx_growth %d\n", req->idx_growth);
618 printk(KERN_DEBUG "\tdata_growth %d dd_growth %d\n", 618 printk(KERN_ERR "\tdata_growth %d dd_growth %d\n",
619 req->data_growth, req->dd_growth); 619 req->data_growth, req->dd_growth);
620 spin_unlock(&dbg_lock); 620 spin_unlock(&dbg_lock);
621} 621}
@@ -623,12 +623,12 @@ void dbg_dump_budget_req(const struct ubifs_budget_req *req)
623void dbg_dump_lstats(const struct ubifs_lp_stats *lst) 623void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
624{ 624{
625 spin_lock(&dbg_lock); 625 spin_lock(&dbg_lock);
626 printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, " 626 printk(KERN_ERR "(pid %d) Lprops statistics: empty_lebs %d, "
627 "idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs); 627 "idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs);
628 printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, " 628 printk(KERN_ERR "\ttaken_empty_lebs %d, total_free %lld, "
629 "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free, 629 "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free,
630 lst->total_dirty); 630 lst->total_dirty);
631 printk(KERN_DEBUG "\ttotal_used %lld, total_dark %lld, " 631 printk(KERN_ERR "\ttotal_used %lld, total_dark %lld, "
632 "total_dead %lld\n", lst->total_used, lst->total_dark, 632 "total_dead %lld\n", lst->total_used, lst->total_dark,
633 lst->total_dead); 633 lst->total_dead);
634 spin_unlock(&dbg_lock); 634 spin_unlock(&dbg_lock);
@@ -644,21 +644,21 @@ void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
644 644
645 spin_lock(&c->space_lock); 645 spin_lock(&c->space_lock);
646 spin_lock(&dbg_lock); 646 spin_lock(&dbg_lock);
647 printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, " 647 printk(KERN_ERR "(pid %d) Budgeting info: data budget sum %lld, "
648 "total budget sum %lld\n", current->pid, 648 "total budget sum %lld\n", current->pid,
649 bi->data_growth + bi->dd_growth, 649 bi->data_growth + bi->dd_growth,
650 bi->data_growth + bi->dd_growth + bi->idx_growth); 650 bi->data_growth + bi->dd_growth + bi->idx_growth);
651 printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, " 651 printk(KERN_ERR "\tbudg_data_growth %lld, budg_dd_growth %lld, "
652 "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth, 652 "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth,
653 bi->idx_growth); 653 bi->idx_growth);
654 printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, " 654 printk(KERN_ERR "\tmin_idx_lebs %d, old_idx_sz %llu, "
655 "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz, 655 "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz,
656 bi->uncommitted_idx); 656 bi->uncommitted_idx);
657 printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n", 657 printk(KERN_ERR "\tpage_budget %d, inode_budget %d, dent_budget %d\n",
658 bi->page_budget, bi->inode_budget, bi->dent_budget); 658 bi->page_budget, bi->inode_budget, bi->dent_budget);
659 printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n", 659 printk(KERN_ERR "\tnospace %u, nospace_rp %u\n",
660 bi->nospace, bi->nospace_rp); 660 bi->nospace, bi->nospace_rp);
661 printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", 661 printk(KERN_ERR "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
662 c->dark_wm, c->dead_wm, c->max_idx_node_sz); 662 c->dark_wm, c->dead_wm, c->max_idx_node_sz);
663 663
664 if (bi != &c->bi) 664 if (bi != &c->bi)
@@ -669,38 +669,38 @@ void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
669 */ 669 */
670 goto out_unlock; 670 goto out_unlock;
671 671
672 printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n", 672 printk(KERN_ERR "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n",
673 c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt); 673 c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt);
674 printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " 674 printk(KERN_ERR "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "
675 "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt), 675 "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt),
676 atomic_long_read(&c->dirty_zn_cnt), 676 atomic_long_read(&c->dirty_zn_cnt),
677 atomic_long_read(&c->clean_zn_cnt)); 677 atomic_long_read(&c->clean_zn_cnt));
678 printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", 678 printk(KERN_ERR "\tgc_lnum %d, ihead_lnum %d\n",
679 c->gc_lnum, c->ihead_lnum); 679 c->gc_lnum, c->ihead_lnum);
680 680
681 /* If we are in R/O mode, journal heads do not exist */ 681 /* If we are in R/O mode, journal heads do not exist */
682 if (c->jheads) 682 if (c->jheads)
683 for (i = 0; i < c->jhead_cnt; i++) 683 for (i = 0; i < c->jhead_cnt; i++)
684 printk(KERN_DEBUG "\tjhead %s\t LEB %d\n", 684 printk(KERN_ERR "\tjhead %s\t LEB %d\n",
685 dbg_jhead(c->jheads[i].wbuf.jhead), 685 dbg_jhead(c->jheads[i].wbuf.jhead),
686 c->jheads[i].wbuf.lnum); 686 c->jheads[i].wbuf.lnum);
687 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { 687 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
688 bud = rb_entry(rb, struct ubifs_bud, rb); 688 bud = rb_entry(rb, struct ubifs_bud, rb);
689 printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); 689 printk(KERN_ERR "\tbud LEB %d\n", bud->lnum);
690 } 690 }
691 list_for_each_entry(bud, &c->old_buds, list) 691 list_for_each_entry(bud, &c->old_buds, list)
692 printk(KERN_DEBUG "\told bud LEB %d\n", bud->lnum); 692 printk(KERN_ERR "\told bud LEB %d\n", bud->lnum);
693 list_for_each_entry(idx_gc, &c->idx_gc, list) 693 list_for_each_entry(idx_gc, &c->idx_gc, list)
694 printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n", 694 printk(KERN_ERR "\tGC'ed idx LEB %d unmap %d\n",
695 idx_gc->lnum, idx_gc->unmap); 695 idx_gc->lnum, idx_gc->unmap);
696 printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); 696 printk(KERN_ERR "\tcommit state %d\n", c->cmt_state);
697 697
698 /* Print budgeting predictions */ 698 /* Print budgeting predictions */
699 available = ubifs_calc_available(c, c->bi.min_idx_lebs); 699 available = ubifs_calc_available(c, c->bi.min_idx_lebs);
700 outstanding = c->bi.data_growth + c->bi.dd_growth; 700 outstanding = c->bi.data_growth + c->bi.dd_growth;
701 free = ubifs_get_free_space_nolock(c); 701 free = ubifs_get_free_space_nolock(c);
702 printk(KERN_DEBUG "Budgeting predictions:\n"); 702 printk(KERN_ERR "Budgeting predictions:\n");
703 printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", 703 printk(KERN_ERR "\tavailable: %lld, outstanding %lld, free %lld\n",
704 available, outstanding, free); 704 available, outstanding, free);
705out_unlock: 705out_unlock:
706 spin_unlock(&dbg_lock); 706 spin_unlock(&dbg_lock);
@@ -720,11 +720,11 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
720 dark = ubifs_calc_dark(c, spc); 720 dark = ubifs_calc_dark(c, spc);
721 721
722 if (lp->flags & LPROPS_INDEX) 722 if (lp->flags & LPROPS_INDEX)
723 printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d " 723 printk(KERN_ERR "LEB %-7d free %-8d dirty %-8d used %-8d "
724 "free + dirty %-8d flags %#x (", lp->lnum, lp->free, 724 "free + dirty %-8d flags %#x (", lp->lnum, lp->free,
725 lp->dirty, c->leb_size - spc, spc, lp->flags); 725 lp->dirty, c->leb_size - spc, spc, lp->flags);
726 else 726 else
727 printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d " 727 printk(KERN_ERR "LEB %-7d free %-8d dirty %-8d used %-8d "
728 "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d " 728 "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d "
729 "flags %#-4x (", lp->lnum, lp->free, lp->dirty, 729 "flags %#-4x (", lp->lnum, lp->free, lp->dirty,
730 c->leb_size - spc, spc, dark, dead, 730 c->leb_size - spc, spc, dark, dead,
@@ -807,7 +807,7 @@ void dbg_dump_lprops(struct ubifs_info *c)
807 struct ubifs_lprops lp; 807 struct ubifs_lprops lp;
808 struct ubifs_lp_stats lst; 808 struct ubifs_lp_stats lst;
809 809
810 printk(KERN_DEBUG "(pid %d) start dumping LEB properties\n", 810 printk(KERN_ERR "(pid %d) start dumping LEB properties\n",
811 current->pid); 811 current->pid);
812 ubifs_get_lp_stats(c, &lst); 812 ubifs_get_lp_stats(c, &lst);
813 dbg_dump_lstats(&lst); 813 dbg_dump_lstats(&lst);
@@ -819,7 +819,7 @@ void dbg_dump_lprops(struct ubifs_info *c)
819 819
820 dbg_dump_lprop(c, &lp); 820 dbg_dump_lprop(c, &lp);
821 } 821 }
822 printk(KERN_DEBUG "(pid %d) finish dumping LEB properties\n", 822 printk(KERN_ERR "(pid %d) finish dumping LEB properties\n",
823 current->pid); 823 current->pid);
824} 824}
825 825
@@ -828,35 +828,35 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
828 int i; 828 int i;
829 829
830 spin_lock(&dbg_lock); 830 spin_lock(&dbg_lock);
831 printk(KERN_DEBUG "(pid %d) dumping LPT information\n", current->pid); 831 printk(KERN_ERR "(pid %d) dumping LPT information\n", current->pid);
832 printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz); 832 printk(KERN_ERR "\tlpt_sz: %lld\n", c->lpt_sz);
833 printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz); 833 printk(KERN_ERR "\tpnode_sz: %d\n", c->pnode_sz);
834 printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz); 834 printk(KERN_ERR "\tnnode_sz: %d\n", c->nnode_sz);
835 printk(KERN_DEBUG "\tltab_sz: %d\n", c->ltab_sz); 835 printk(KERN_ERR "\tltab_sz: %d\n", c->ltab_sz);
836 printk(KERN_DEBUG "\tlsave_sz: %d\n", c->lsave_sz); 836 printk(KERN_ERR "\tlsave_sz: %d\n", c->lsave_sz);
837 printk(KERN_DEBUG "\tbig_lpt: %d\n", c->big_lpt); 837 printk(KERN_ERR "\tbig_lpt: %d\n", c->big_lpt);
838 printk(KERN_DEBUG "\tlpt_hght: %d\n", c->lpt_hght); 838 printk(KERN_ERR "\tlpt_hght: %d\n", c->lpt_hght);
839 printk(KERN_DEBUG "\tpnode_cnt: %d\n", c->pnode_cnt); 839 printk(KERN_ERR "\tpnode_cnt: %d\n", c->pnode_cnt);
840 printk(KERN_DEBUG "\tnnode_cnt: %d\n", c->nnode_cnt); 840 printk(KERN_ERR "\tnnode_cnt: %d\n", c->nnode_cnt);
841 printk(KERN_DEBUG "\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt); 841 printk(KERN_ERR "\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt);
842 printk(KERN_DEBUG "\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt); 842 printk(KERN_ERR "\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt);
843 printk(KERN_DEBUG "\tlsave_cnt: %d\n", c->lsave_cnt); 843 printk(KERN_ERR "\tlsave_cnt: %d\n", c->lsave_cnt);
844 printk(KERN_DEBUG "\tspace_bits: %d\n", c->space_bits); 844 printk(KERN_ERR "\tspace_bits: %d\n", c->space_bits);
845 printk(KERN_DEBUG "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits); 845 printk(KERN_ERR "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits);
846 printk(KERN_DEBUG "\tlpt_offs_bits: %d\n", c->lpt_offs_bits); 846 printk(KERN_ERR "\tlpt_offs_bits: %d\n", c->lpt_offs_bits);
847 printk(KERN_DEBUG "\tlpt_spc_bits: %d\n", c->lpt_spc_bits); 847 printk(KERN_ERR "\tlpt_spc_bits: %d\n", c->lpt_spc_bits);
848 printk(KERN_DEBUG "\tpcnt_bits: %d\n", c->pcnt_bits); 848 printk(KERN_ERR "\tpcnt_bits: %d\n", c->pcnt_bits);
849 printk(KERN_DEBUG "\tlnum_bits: %d\n", c->lnum_bits); 849 printk(KERN_ERR "\tlnum_bits: %d\n", c->lnum_bits);
850 printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); 850 printk(KERN_ERR "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
851 printk(KERN_DEBUG "\tLPT head is at %d:%d\n", 851 printk(KERN_ERR "\tLPT head is at %d:%d\n",
852 c->nhead_lnum, c->nhead_offs); 852 c->nhead_lnum, c->nhead_offs);
853 printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", 853 printk(KERN_ERR "\tLPT ltab is at %d:%d\n",
854 c->ltab_lnum, c->ltab_offs); 854 c->ltab_lnum, c->ltab_offs);
855 if (c->big_lpt) 855 if (c->big_lpt)
856 printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n", 856 printk(KERN_ERR "\tLPT lsave is at %d:%d\n",
857 c->lsave_lnum, c->lsave_offs); 857 c->lsave_lnum, c->lsave_offs);
858 for (i = 0; i < c->lpt_lebs; i++) 858 for (i = 0; i < c->lpt_lebs; i++)
859 printk(KERN_DEBUG "\tLPT LEB %d free %d dirty %d tgc %d " 859 printk(KERN_ERR "\tLPT LEB %d free %d dirty %d tgc %d "
860 "cmt %d\n", i + c->lpt_first, c->ltab[i].free, 860 "cmt %d\n", i + c->lpt_first, c->ltab[i].free,
861 c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt); 861 c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt);
862 spin_unlock(&dbg_lock); 862 spin_unlock(&dbg_lock);
@@ -867,12 +867,12 @@ void dbg_dump_sleb(const struct ubifs_info *c,
867{ 867{
868 struct ubifs_scan_node *snod; 868 struct ubifs_scan_node *snod;
869 869
870 printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n", 870 printk(KERN_ERR "(pid %d) start dumping scanned data from LEB %d:%d\n",
871 current->pid, sleb->lnum, offs); 871 current->pid, sleb->lnum, offs);
872 872
873 list_for_each_entry(snod, &sleb->nodes, list) { 873 list_for_each_entry(snod, &sleb->nodes, list) {
874 cond_resched(); 874 cond_resched();
875 printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum, 875 printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", sleb->lnum,
876 snod->offs, snod->len); 876 snod->offs, snod->len);
877 dbg_dump_node(c, snod->node); 877 dbg_dump_node(c, snod->node);
878 } 878 }
@@ -887,7 +887,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
887 if (dbg_is_tst_rcvry(c)) 887 if (dbg_is_tst_rcvry(c))
888 return; 888 return;
889 889
890 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", 890 printk(KERN_ERR "(pid %d) start dumping LEB %d\n",
891 current->pid, lnum); 891 current->pid, lnum);
892 892
893 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); 893 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
@@ -902,17 +902,17 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
902 goto out; 902 goto out;
903 } 903 }
904 904
905 printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, 905 printk(KERN_ERR "LEB %d has %d nodes ending at %d\n", lnum,
906 sleb->nodes_cnt, sleb->endpt); 906 sleb->nodes_cnt, sleb->endpt);
907 907
908 list_for_each_entry(snod, &sleb->nodes, list) { 908 list_for_each_entry(snod, &sleb->nodes, list) {
909 cond_resched(); 909 cond_resched();
910 printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", lnum, 910 printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", lnum,
911 snod->offs, snod->len); 911 snod->offs, snod->len);
912 dbg_dump_node(c, snod->node); 912 dbg_dump_node(c, snod->node);
913 } 913 }
914 914
915 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", 915 printk(KERN_ERR "(pid %d) finish dumping LEB %d\n",
916 current->pid, lnum); 916 current->pid, lnum);
917 ubifs_scan_destroy(sleb); 917 ubifs_scan_destroy(sleb);
918 918
@@ -934,7 +934,7 @@ void dbg_dump_znode(const struct ubifs_info *c,
934 else 934 else
935 zbr = &c->zroot; 935 zbr = &c->zroot;
936 936
937 printk(KERN_DEBUG "znode %p, LEB %d:%d len %d parent %p iip %d level %d" 937 printk(KERN_ERR "znode %p, LEB %d:%d len %d parent %p iip %d level %d"
938 " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs, 938 " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs,
939 zbr->len, znode->parent, znode->iip, znode->level, 939 zbr->len, znode->parent, znode->iip, znode->level,
940 znode->child_cnt, znode->flags); 940 znode->child_cnt, znode->flags);
@@ -944,18 +944,18 @@ void dbg_dump_znode(const struct ubifs_info *c,
944 return; 944 return;
945 } 945 }
946 946
947 printk(KERN_DEBUG "zbranches:\n"); 947 printk(KERN_ERR "zbranches:\n");
948 for (n = 0; n < znode->child_cnt; n++) { 948 for (n = 0; n < znode->child_cnt; n++) {
949 zbr = &znode->zbranch[n]; 949 zbr = &znode->zbranch[n];
950 if (znode->level > 0) 950 if (znode->level > 0)
951 printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key " 951 printk(KERN_ERR "\t%d: znode %p LEB %d:%d len %d key "
952 "%s\n", n, zbr->znode, zbr->lnum, 952 "%s\n", n, zbr->znode, zbr->lnum,
953 zbr->offs, zbr->len, 953 zbr->offs, zbr->len,
954 dbg_snprintf_key(c, &zbr->key, 954 dbg_snprintf_key(c, &zbr->key,
955 key_buf, 955 key_buf,
956 DBG_KEY_BUF_LEN)); 956 DBG_KEY_BUF_LEN));
957 else 957 else
958 printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key " 958 printk(KERN_ERR "\t%d: LNC %p LEB %d:%d len %d key "
959 "%s\n", n, zbr->znode, zbr->lnum, 959 "%s\n", n, zbr->znode, zbr->lnum,
960 zbr->offs, zbr->len, 960 zbr->offs, zbr->len,
961 dbg_snprintf_key(c, &zbr->key, 961 dbg_snprintf_key(c, &zbr->key,
@@ -969,16 +969,16 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
969{ 969{
970 int i; 970 int i;
971 971
972 printk(KERN_DEBUG "(pid %d) start dumping heap cat %d (%d elements)\n", 972 printk(KERN_ERR "(pid %d) start dumping heap cat %d (%d elements)\n",
973 current->pid, cat, heap->cnt); 973 current->pid, cat, heap->cnt);
974 for (i = 0; i < heap->cnt; i++) { 974 for (i = 0; i < heap->cnt; i++) {
975 struct ubifs_lprops *lprops = heap->arr[i]; 975 struct ubifs_lprops *lprops = heap->arr[i];
976 976
977 printk(KERN_DEBUG "\t%d. LEB %d hpos %d free %d dirty %d " 977 printk(KERN_ERR "\t%d. LEB %d hpos %d free %d dirty %d "
978 "flags %d\n", i, lprops->lnum, lprops->hpos, 978 "flags %d\n", i, lprops->lnum, lprops->hpos,
979 lprops->free, lprops->dirty, lprops->flags); 979 lprops->free, lprops->dirty, lprops->flags);
980 } 980 }
981 printk(KERN_DEBUG "(pid %d) finish dumping heap\n", current->pid); 981 printk(KERN_ERR "(pid %d) finish dumping heap\n", current->pid);
982} 982}
983 983
984void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, 984void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
@@ -986,15 +986,15 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
986{ 986{
987 int i; 987 int i;
988 988
989 printk(KERN_DEBUG "(pid %d) dumping pnode:\n", current->pid); 989 printk(KERN_ERR "(pid %d) dumping pnode:\n", current->pid);
990 printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", 990 printk(KERN_ERR "\taddress %zx parent %zx cnext %zx\n",
991 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); 991 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
992 printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", 992 printk(KERN_ERR "\tflags %lu iip %d level %d num %d\n",
993 pnode->flags, iip, pnode->level, pnode->num); 993 pnode->flags, iip, pnode->level, pnode->num);
994 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 994 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
995 struct ubifs_lprops *lp = &pnode->lprops[i]; 995 struct ubifs_lprops *lp = &pnode->lprops[i];
996 996
997 printk(KERN_DEBUG "\t%d: free %d dirty %d flags %d lnum %d\n", 997 printk(KERN_ERR "\t%d: free %d dirty %d flags %d lnum %d\n",
998 i, lp->free, lp->dirty, lp->flags, lp->lnum); 998 i, lp->free, lp->dirty, lp->flags, lp->lnum);
999 } 999 }
1000} 1000}
@@ -1004,20 +1004,20 @@ void dbg_dump_tnc(struct ubifs_info *c)
1004 struct ubifs_znode *znode; 1004 struct ubifs_znode *znode;
1005 int level; 1005 int level;
1006 1006
1007 printk(KERN_DEBUG "\n"); 1007 printk(KERN_ERR "\n");
1008 printk(KERN_DEBUG "(pid %d) start dumping TNC tree\n", current->pid); 1008 printk(KERN_ERR "(pid %d) start dumping TNC tree\n", current->pid);
1009 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); 1009 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
1010 level = znode->level; 1010 level = znode->level;
1011 printk(KERN_DEBUG "== Level %d ==\n", level); 1011 printk(KERN_ERR "== Level %d ==\n", level);
1012 while (znode) { 1012 while (znode) {
1013 if (level != znode->level) { 1013 if (level != znode->level) {
1014 level = znode->level; 1014 level = znode->level;
1015 printk(KERN_DEBUG "== Level %d ==\n", level); 1015 printk(KERN_ERR "== Level %d ==\n", level);
1016 } 1016 }
1017 dbg_dump_znode(c, znode); 1017 dbg_dump_znode(c, znode);
1018 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); 1018 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
1019 } 1019 }
1020 printk(KERN_DEBUG "(pid %d) finish dumping TNC tree\n", current->pid); 1020 printk(KERN_ERR "(pid %d) finish dumping TNC tree\n", current->pid);
1021} 1021}
1022 1022
1023static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, 1023static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
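Every debug.c hunk above makes the same substitution: these dump helpers
typically run only after corruption or an assertion failure has already been
detected, yet they printed at KERN_DEBUG, which the default console_loglevel
suppresses, so the dumps were invisible exactly when they mattered. A minimal
sketch of that gating, outside the kernel (console_loglevel here, the
LOGLEVEL_* values and emit() are illustrative stand-ins, not kernel API):

	#include <stdio.h>

	#define LOGLEVEL_ERR	3	/* passes the default filter */
	#define LOGLEVEL_DEBUG	7	/* dropped unless verbosity is raised */

	static int console_loglevel = 4;	/* typical default: show < 4 */

	static void emit(int level, const char *msg)
	{
		if (level < console_loglevel)	/* same shape as printk's check */
			printf("%s\n", msg);
	}

	/* emit(LOGLEVEL_DEBUG, ...) prints nothing at the default level,
	 * emit(LOGLEVEL_ERR, ...) always prints, which is why the dump
	 * functions switch to KERN_ERR. */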
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index ad1a6fee6010..9f717655df18 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -164,9 +164,7 @@ struct ubifs_global_debug_info {
164#define dbg_dump_stack() dump_stack() 164#define dbg_dump_stack() dump_stack()
165 165
166#define dbg_err(fmt, ...) do { \ 166#define dbg_err(fmt, ...) do { \
167 spin_lock(&dbg_lock); \
168 ubifs_err(fmt, ##__VA_ARGS__); \ 167 ubifs_err(fmt, ##__VA_ARGS__); \
169 spin_unlock(&dbg_lock); \
170} while (0) 168} while (0)
171 169
172#define ubifs_dbg_msg(type, fmt, ...) \ 170#define ubifs_dbg_msg(type, fmt, ...) \
@@ -217,7 +215,6 @@ struct ubifs_global_debug_info {
217/* Additional recovery messages */ 215/* Additional recovery messages */
218#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) 216#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__)
219 217
220extern spinlock_t dbg_lock;
221extern struct ubifs_global_debug_info ubifs_dbg; 218extern struct ubifs_global_debug_info ubifs_dbg;
222 219
223static inline int dbg_is_chk_gen(const struct ubifs_info *c) 220static inline int dbg_is_chk_gen(const struct ubifs_info *c)
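With the lock calls gone, dbg_err() keeps only its do { } while (0) wrapper:
ubifs_err() is a single printk call and needs no serialization, while
dbg_lock itself becomes static to debug.c (first hunk above), where it still
keeps multi-line dumps from interleaving. The wrapper is retained so the
macro parses as one statement; a sketch of the caller shape it protects (the
if/else caller is illustrative):

	#define dbg_err(fmt, ...) do { \
		ubifs_err(fmt, ##__VA_ARGS__); \
	} while (0)

	if (err)
		dbg_err("unexpected return value %d", err);
	else			/* without do/while(0), a multi-statement */
		err = 0;	/* macro body would break this else */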
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index d6fe1c79f18b..ec9f1870ab7f 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -566,6 +566,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
566 int sz_change = CALC_DENT_SIZE(dentry->d_name.len); 566 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
567 int err, budgeted = 1; 567 int err, budgeted = 1;
568 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; 568 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
569 unsigned int saved_nlink = inode->i_nlink;
569 570
570 /* 571 /*
571 * Budget request settings: deletion direntry, deletion inode (+1 for 572 * Budget request settings: deletion direntry, deletion inode (+1 for
@@ -613,7 +614,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
613out_cancel: 614out_cancel:
614 dir->i_size += sz_change; 615 dir->i_size += sz_change;
615 dir_ui->ui_size = dir->i_size; 616 dir_ui->ui_size = dir->i_size;
616 inc_nlink(inode); 617 set_nlink(inode, saved_nlink);
617 unlock_2_inodes(dir, inode); 618 unlock_2_inodes(dir, inode);
618 if (budgeted) 619 if (budgeted)
619 ubifs_release_budget(c, &req); 620 ubifs_release_budget(c, &req);
@@ -704,8 +705,7 @@ out_cancel:
704 dir->i_size += sz_change; 705 dir->i_size += sz_change;
705 dir_ui->ui_size = dir->i_size; 706 dir_ui->ui_size = dir->i_size;
706 inc_nlink(dir); 707 inc_nlink(dir);
707 inc_nlink(inode); 708 set_nlink(inode, 2);
708 inc_nlink(inode);
709 unlock_2_inodes(dir, inode); 709 unlock_2_inodes(dir, inode);
710 if (budgeted) 710 if (budgeted)
711 ubifs_release_budget(c, &req); 711 ubifs_release_budget(c, &req);
@@ -977,6 +977,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
977 struct ubifs_budget_req ino_req = { .dirtied_ino = 1, 977 struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
978 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; 978 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
979 struct timespec time; 979 struct timespec time;
980 unsigned int saved_nlink;
980 981
981 /* 982 /*
982 * Budget request settings: deletion direntry, new direntry, removing 983 * Budget request settings: deletion direntry, new direntry, removing
@@ -1059,13 +1060,14 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
1059 if (unlink) { 1060 if (unlink) {
1060 /* 1061 /*
1061 * Directories cannot have hard-links, so if this is a 1062 * Directories cannot have hard-links, so if this is a
1062 * directory, decrement its @i_nlink twice because an empty 1063 * directory, just clear @i_nlink.
1063 * directory has @i_nlink 2.
1064 */ 1064 */
1065 saved_nlink = new_inode->i_nlink;
1065 if (is_dir) 1066 if (is_dir)
1067 clear_nlink(new_inode);
1068 else
1066 drop_nlink(new_inode); 1069 drop_nlink(new_inode);
1067 new_inode->i_ctime = time; 1070 new_inode->i_ctime = time;
1068 drop_nlink(new_inode);
1069 } else { 1071 } else {
1070 new_dir->i_size += new_sz; 1072 new_dir->i_size += new_sz;
1071 ubifs_inode(new_dir)->ui_size = new_dir->i_size; 1073 ubifs_inode(new_dir)->ui_size = new_dir->i_size;
@@ -1102,9 +1104,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
1102 1104
1103out_cancel: 1105out_cancel:
1104 if (unlink) { 1106 if (unlink) {
1105 if (is_dir) 1107 set_nlink(new_inode, saved_nlink);
1106 inc_nlink(new_inode);
1107 inc_nlink(new_inode);
1108 } else { 1108 } else {
1109 new_dir->i_size -= new_sz; 1109 new_dir->i_size -= new_sz;
1110 ubifs_inode(new_dir)->ui_size = new_dir->i_size; 1110 ubifs_inode(new_dir)->ui_size = new_dir->i_size;
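All three dir.c fixes above replace the same fragile idiom: the error paths
used to rebuild the link count with one or two inc_nlink() calls, which can
trip the VFS warning once i_nlink has already reached zero and gets the count
wrong for directories. Snapshotting the count first and restoring it with
set_nlink() rolls back exactly, whatever the mode. A sketch of the pattern,
where commit_op() is an illustrative stand-in for the journal update done in
the real code:

	unsigned int saved_nlink = inode->i_nlink;	/* snapshot first */

	if (is_dir)
		clear_nlink(inode);	/* empty dir goes 2 -> 0 directly */
	else
		drop_nlink(inode);

	err = commit_op(c, inode);		/* illustrative stand-in */
	if (err)
		set_nlink(inode, saved_nlink);	/* exact rollback, safe
						 * even from i_nlink == 0 */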
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index ee4f43f4bb99..2a935b317232 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -679,7 +679,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
679 ret == SCANNED_GARBAGE || 679 ret == SCANNED_GARBAGE ||
680 ret == SCANNED_A_BAD_PAD_NODE || 680 ret == SCANNED_A_BAD_PAD_NODE ||
681 ret == SCANNED_A_CORRUPT_NODE) { 681 ret == SCANNED_A_CORRUPT_NODE) {
682 dbg_rcvry("found corruption - %d", ret); 682 dbg_rcvry("found corruption (%d) at %d:%d",
683 ret, lnum, offs);
683 break; 684 break;
684 } else { 685 } else {
685 dbg_err("unexpected return value %d", ret); 686 dbg_err("unexpected return value %d", ret);
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 6094c5a5d7a8..771f7fb6ce92 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -410,13 +410,23 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
410 } 410 }
411 411
412 if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) { 412 if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) {
413 err = 7; 413 ubifs_err("too few main LEBs count %d, must be at least %d",
414 c->main_lebs, UBIFS_MIN_MAIN_LEBS);
414 goto failed; 415 goto failed;
415 } 416 }
416 417
417 if (c->max_bud_bytes < (long long)c->leb_size * UBIFS_MIN_BUD_LEBS || 418 max_bytes = (long long)c->leb_size * UBIFS_MIN_BUD_LEBS;
418 c->max_bud_bytes > (long long)c->leb_size * c->main_lebs) { 419 if (c->max_bud_bytes < max_bytes) {
419 err = 8; 420 ubifs_err("too small journal (%lld bytes), must be at least "
421 "%lld bytes", c->max_bud_bytes, max_bytes);
422 goto failed;
423 }
424
425 max_bytes = (long long)c->leb_size * c->main_lebs;
426 if (c->max_bud_bytes > max_bytes) {
427 ubifs_err("too large journal size (%lld bytes), only %lld bytes"
428 "available in the main area",
429 c->max_bud_bytes, max_bytes);
420 goto failed; 430 goto failed;
421 } 431 }
422 432
@@ -450,7 +460,6 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
450 goto failed; 460 goto failed;
451 } 461 }
452 462
453 max_bytes = c->main_lebs * (long long)c->leb_size;
454 if (c->rp_size < 0 || max_bytes < c->rp_size) { 463 if (c->rp_size < 0 || max_bytes < c->rp_size) {
455 err = 14; 464 err = 14;
456 goto failed; 465 goto failed;
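Two notes on the validate_sb() hunks: the combined journal-size test is split
in two so each failure names the violated bound instead of the bare
"err = 8", and dropping the max_bytes assignment before the rp_size check is
safe because max_bytes still holds leb_size * main_lebs from the new code at
line 425. Illustrative numbers for the two bounds (the value 3 for
UBIFS_MIN_BUD_LEBS is an assumption here):

	long long leb_size  = 128 * 1024;	/* 128 KiB LEB */
	long long main_lebs = 1000;
	long long min_bud   = leb_size * 3;		/*    393216 bytes */
	long long max_bud   = leb_size * main_lebs;	/* 131072000 bytes */
	/* validate_sb() now rejects max_bud_bytes outside [min_bud, max_bud]
	 * with a message quoting both the value and the bound. */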
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 12e94774aa88..93d59aceaaef 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -84,9 +84,6 @@
84#define INUM_WARN_WATERMARK 0xFFF00000 84#define INUM_WARN_WATERMARK 0xFFF00000
85#define INUM_WATERMARK 0xFFFFFF00 85#define INUM_WATERMARK 0xFFFFFF00
86 86
87/* Largest key size supported in this implementation */
88#define CUR_MAX_KEY_LEN UBIFS_SK_LEN
89
90/* Maximum number of entries in each LPT (LEB category) heap */ 87/* Maximum number of entries in each LPT (LEB category) heap */
91#define LPT_HEAP_SZ 256 88#define LPT_HEAP_SZ 256
92 89
@@ -277,10 +274,10 @@ struct ubifs_old_idx {
277 274
278/* The below union makes it easier to deal with keys */ 275/* The below union makes it easier to deal with keys */
279union ubifs_key { 276union ubifs_key {
280 uint8_t u8[CUR_MAX_KEY_LEN]; 277 uint8_t u8[UBIFS_SK_LEN];
281 uint32_t u32[CUR_MAX_KEY_LEN/4]; 278 uint32_t u32[UBIFS_SK_LEN/4];
282 uint64_t u64[CUR_MAX_KEY_LEN/8]; 279 uint64_t u64[UBIFS_SK_LEN/8];
283 __le32 j32[CUR_MAX_KEY_LEN/4]; 280 __le32 j32[UBIFS_SK_LEN/4];
284}; 281};
285 282
286/** 283/**
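CUR_MAX_KEY_LEN only ever aliased UBIFS_SK_LEN, so the union can size itself
off the on-media constant directly. The u64 member still requires the key
length to be a multiple of 8; a compile-time guard one could place in an init
function (illustrative, not part of this patch):

	BUILD_BUG_ON(UBIFS_SK_LEN % 8);
	BUILD_BUG_ON(sizeof(union ubifs_key) != UBIFS_SK_LEN);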
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 987585bb0a1d..1ba2baaf4367 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -105,7 +105,6 @@ static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)
105} 105}
106 106
107static void udf_bitmap_free_blocks(struct super_block *sb, 107static void udf_bitmap_free_blocks(struct super_block *sb,
108 struct inode *inode,
109 struct udf_bitmap *bitmap, 108 struct udf_bitmap *bitmap,
110 struct kernel_lb_addr *bloc, 109 struct kernel_lb_addr *bloc,
111 uint32_t offset, 110 uint32_t offset,
@@ -172,7 +171,6 @@ error_return:
172} 171}
173 172
174static int udf_bitmap_prealloc_blocks(struct super_block *sb, 173static int udf_bitmap_prealloc_blocks(struct super_block *sb,
175 struct inode *inode,
176 struct udf_bitmap *bitmap, 174 struct udf_bitmap *bitmap,
177 uint16_t partition, uint32_t first_block, 175 uint16_t partition, uint32_t first_block,
178 uint32_t block_count) 176 uint32_t block_count)
@@ -223,7 +221,6 @@ out:
223} 221}
224 222
225static int udf_bitmap_new_block(struct super_block *sb, 223static int udf_bitmap_new_block(struct super_block *sb,
226 struct inode *inode,
227 struct udf_bitmap *bitmap, uint16_t partition, 224 struct udf_bitmap *bitmap, uint16_t partition,
228 uint32_t goal, int *err) 225 uint32_t goal, int *err)
229{ 226{
@@ -349,7 +346,6 @@ error_return:
349} 346}
350 347
351static void udf_table_free_blocks(struct super_block *sb, 348static void udf_table_free_blocks(struct super_block *sb,
352 struct inode *inode,
353 struct inode *table, 349 struct inode *table,
354 struct kernel_lb_addr *bloc, 350 struct kernel_lb_addr *bloc,
355 uint32_t offset, 351 uint32_t offset,
@@ -581,7 +577,6 @@ error_return:
581} 577}
582 578
583static int udf_table_prealloc_blocks(struct super_block *sb, 579static int udf_table_prealloc_blocks(struct super_block *sb,
584 struct inode *inode,
585 struct inode *table, uint16_t partition, 580 struct inode *table, uint16_t partition,
586 uint32_t first_block, uint32_t block_count) 581 uint32_t first_block, uint32_t block_count)
587{ 582{
@@ -643,7 +638,6 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
643} 638}
644 639
645static int udf_table_new_block(struct super_block *sb, 640static int udf_table_new_block(struct super_block *sb,
646 struct inode *inode,
647 struct inode *table, uint16_t partition, 641 struct inode *table, uint16_t partition,
648 uint32_t goal, int *err) 642 uint32_t goal, int *err)
649{ 643{
@@ -743,18 +737,23 @@ void udf_free_blocks(struct super_block *sb, struct inode *inode,
743 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; 737 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
744 738
745 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { 739 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
746 udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap, 740 udf_bitmap_free_blocks(sb, map->s_uspace.s_bitmap,
747 bloc, offset, count); 741 bloc, offset, count);
748 } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { 742 } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
749 udf_table_free_blocks(sb, inode, map->s_uspace.s_table, 743 udf_table_free_blocks(sb, map->s_uspace.s_table,
750 bloc, offset, count); 744 bloc, offset, count);
751 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { 745 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
752 udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap, 746 udf_bitmap_free_blocks(sb, map->s_fspace.s_bitmap,
753 bloc, offset, count); 747 bloc, offset, count);
754 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { 748 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
755 udf_table_free_blocks(sb, inode, map->s_fspace.s_table, 749 udf_table_free_blocks(sb, map->s_fspace.s_table,
756 bloc, offset, count); 750 bloc, offset, count);
757 } 751 }
752
753 if (inode) {
754 inode_sub_bytes(inode,
755 ((sector_t)count) << sb->s_blocksize_bits);
756 }
758} 757}
759 758
760inline int udf_prealloc_blocks(struct super_block *sb, 759inline int udf_prealloc_blocks(struct super_block *sb,
@@ -763,29 +762,34 @@ inline int udf_prealloc_blocks(struct super_block *sb,
763 uint32_t block_count) 762 uint32_t block_count)
764{ 763{
765 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; 764 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
765 sector_t allocated;
766 766
767 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) 767 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
768 return udf_bitmap_prealloc_blocks(sb, inode, 768 allocated = udf_bitmap_prealloc_blocks(sb,
769 map->s_uspace.s_bitmap, 769 map->s_uspace.s_bitmap,
770 partition, first_block, 770 partition, first_block,
771 block_count); 771 block_count);
772 else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) 772 else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
773 return udf_table_prealloc_blocks(sb, inode, 773 allocated = udf_table_prealloc_blocks(sb,
774 map->s_uspace.s_table, 774 map->s_uspace.s_table,
775 partition, first_block, 775 partition, first_block,
776 block_count); 776 block_count);
777 else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) 777 else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
778 return udf_bitmap_prealloc_blocks(sb, inode, 778 allocated = udf_bitmap_prealloc_blocks(sb,
779 map->s_fspace.s_bitmap, 779 map->s_fspace.s_bitmap,
780 partition, first_block, 780 partition, first_block,
781 block_count); 781 block_count);
782 else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) 782 else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
783 return udf_table_prealloc_blocks(sb, inode, 783 allocated = udf_table_prealloc_blocks(sb,
784 map->s_fspace.s_table, 784 map->s_fspace.s_table,
785 partition, first_block, 785 partition, first_block,
786 block_count); 786 block_count);
787 else 787 else
788 return 0; 788 return 0;
789
790 if (inode && allocated > 0)
791 inode_add_bytes(inode, allocated << sb->s_blocksize_bits);
792 return allocated;
789} 793}
790 794
791inline int udf_new_block(struct super_block *sb, 795inline int udf_new_block(struct super_block *sb,
@@ -793,25 +797,29 @@ inline int udf_new_block(struct super_block *sb,
 			 uint16_t partition, uint32_t goal, int *err)
 {
 	struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
+	int block;
 
 	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
-		return udf_bitmap_new_block(sb, inode,
+		block = udf_bitmap_new_block(sb,
 					    map->s_uspace.s_bitmap,
 					    partition, goal, err);
 	else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
-		return udf_table_new_block(sb, inode,
+		block = udf_table_new_block(sb,
 					   map->s_uspace.s_table,
 					   partition, goal, err);
 	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
-		return udf_bitmap_new_block(sb, inode,
+		block = udf_bitmap_new_block(sb,
 					    map->s_fspace.s_bitmap,
 					    partition, goal, err);
 	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
-		return udf_table_new_block(sb, inode,
+		block = udf_table_new_block(sb,
 					   map->s_fspace.s_table,
 					   partition, goal, err);
 	else {
 		*err = -EIO;
 		return 0;
 	}
+	if (inode && block)
+		inode_add_bytes(inode, sb->s_blocksize);
+	return block;
 }
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 05ab48195be9..7e5aae4bf46f 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -116,6 +116,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
116 iinfo->i_lenEAttr = 0; 116 iinfo->i_lenEAttr = 0;
117 iinfo->i_lenAlloc = 0; 117 iinfo->i_lenAlloc = 0;
118 iinfo->i_use = 0; 118 iinfo->i_use = 0;
119 iinfo->i_checkpoint = 1;
119 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) 120 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
120 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; 121 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
121 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) 122 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 7699df7b3198..7d7528008359 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1358,6 +1358,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1358 iinfo->i_unique = le64_to_cpu(fe->uniqueID); 1358 iinfo->i_unique = le64_to_cpu(fe->uniqueID);
1359 iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); 1359 iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr);
1360 iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs); 1360 iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs);
1361 iinfo->i_checkpoint = le32_to_cpu(fe->checkpoint);
1361 offset = sizeof(struct fileEntry) + iinfo->i_lenEAttr; 1362 offset = sizeof(struct fileEntry) + iinfo->i_lenEAttr;
1362 } else { 1363 } else {
1363 inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << 1364 inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
@@ -1379,6 +1380,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1379 iinfo->i_unique = le64_to_cpu(efe->uniqueID); 1380 iinfo->i_unique = le64_to_cpu(efe->uniqueID);
1380 iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); 1381 iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr);
1381 iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); 1382 iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs);
1383 iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint);
1382 offset = sizeof(struct extendedFileEntry) + 1384 offset = sizeof(struct extendedFileEntry) +
1383 iinfo->i_lenEAttr; 1385 iinfo->i_lenEAttr;
1384 } 1386 }
@@ -1495,6 +1497,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1495 struct buffer_head *bh = NULL; 1497 struct buffer_head *bh = NULL;
1496 struct fileEntry *fe; 1498 struct fileEntry *fe;
1497 struct extendedFileEntry *efe; 1499 struct extendedFileEntry *efe;
1500 uint64_t lb_recorded;
1498 uint32_t udfperms; 1501 uint32_t udfperms;
1499 uint16_t icbflags; 1502 uint16_t icbflags;
1500 uint16_t crclen; 1503 uint16_t crclen;
@@ -1589,13 +1592,18 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		dsea->minorDeviceIdent = cpu_to_le32(iminor(inode));
 	}
 
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+		lb_recorded = 0; /* No extents => no blocks! */
+	else
+		lb_recorded =
+			(inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >>
+			(blocksize_bits - 9);
+
 	if (iinfo->i_efe == 0) {
 		memcpy(bh->b_data + sizeof(struct fileEntry),
 		       iinfo->i_ext.i_data,
 		       inode->i_sb->s_blocksize - sizeof(struct fileEntry));
-		fe->logicalBlocksRecorded = cpu_to_le64(
-			(inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >>
-			(blocksize_bits - 9));
+		fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);
 
 		udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
 		udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
@@ -1607,6 +1615,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1607 fe->uniqueID = cpu_to_le64(iinfo->i_unique); 1615 fe->uniqueID = cpu_to_le64(iinfo->i_unique);
1608 fe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); 1616 fe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr);
1609 fe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); 1617 fe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc);
1618 fe->checkpoint = cpu_to_le32(iinfo->i_checkpoint);
1610 fe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_FE); 1619 fe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_FE);
1611 crclen = sizeof(struct fileEntry); 1620 crclen = sizeof(struct fileEntry);
1612 } else { 1621 } else {
@@ -1615,9 +1624,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		       inode->i_sb->s_blocksize -
 					sizeof(struct extendedFileEntry));
 		efe->objectSize = cpu_to_le64(inode->i_size);
-		efe->logicalBlocksRecorded = cpu_to_le64(
-			(inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >>
-			(blocksize_bits - 9));
+		efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);
 
 		if (iinfo->i_crtime.tv_sec > inode->i_atime.tv_sec ||
 		    (iinfo->i_crtime.tv_sec == inode->i_atime.tv_sec &&
@@ -1646,6 +1653,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1646 efe->uniqueID = cpu_to_le64(iinfo->i_unique); 1653 efe->uniqueID = cpu_to_le64(iinfo->i_unique);
1647 efe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); 1654 efe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr);
1648 efe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); 1655 efe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc);
1656 efe->checkpoint = cpu_to_le32(iinfo->i_checkpoint);
1649 efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE); 1657 efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE);
1650 crclen = sizeof(struct extendedFileEntry); 1658 crclen = sizeof(struct extendedFileEntry);
1651 } 1659 }
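[Annotation: with the change above, both file entry variants now share one lb_recorded value. inode->i_blocks counts 512-byte sectors, so the expression rounds that count up to whole filesystem blocks. The same arithmetic in isolation, as a sketch assuming blocksize_bits >= 9:]

	/* Round a count of 512-byte sectors up to filesystem blocks. */
	static uint64_t sectors_to_fs_blocks(uint64_t i_blocks,
					     unsigned int blocksize_bits)
	{
		unsigned int shift = blocksize_bits - 9; /* 512 << shift == block size */

		return (i_blocks + (1ULL << shift) - 1) >> shift;
	}
	/* e.g. 4K blocks (shift == 3): 9 sectors -> (9 + 7) >> 3 == 2 blocks */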
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 85067b4c7e14..ac8a348dcb69 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -950,11 +950,8 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
 	else
 		bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
 
-	if (bitmap == NULL) {
-		udf_err(sb, "Unable to allocate space for bitmap and %d buffer_head pointers\n",
-			nr_groups);
+	if (bitmap == NULL)
 		return NULL;
-	}
 
 	bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1);
 	bitmap->s_nr_groups = nr_groups;
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index d1bd31ea724e..bb8309dcd5c1 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -23,6 +23,7 @@ struct udf_inode_info {
23 __u64 i_lenExtents; 23 __u64 i_lenExtents;
24 __u32 i_next_alloc_block; 24 __u32 i_next_alloc_block;
25 __u32 i_next_alloc_goal; 25 __u32 i_next_alloc_goal;
26 __u32 i_checkpoint;
26 unsigned i_alloc_type : 3; 27 unsigned i_alloc_type : 3;
27 unsigned i_efe : 1; /* extendedFileEntry */ 28 unsigned i_efe : 1; /* extendedFileEntry */
28 unsigned i_use : 1; /* unallocSpaceEntry */ 29 unsigned i_use : 1; /* unallocSpaceEntry */
diff --git a/fs/xattr.c b/fs/xattr.c
index 82f43376c7cd..d6dfd247bb2f 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -16,7 +16,7 @@
 #include <linux/security.h>
 #include <linux/evm.h>
 #include <linux/syscalls.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/fsnotify.h>
 #include <linux/audit.h>
 #include <asm/uaccess.h>
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index 8d5a506c82eb..69d06b07b169 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -5,7 +5,7 @@
  * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/gfp.h>
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 427a4e82a588..0a9977983f92 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -96,9 +96,6 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
 				   xfs_qm_bhv.o \
 				   xfs_qm.o \
 				   xfs_quotaops.o
-ifeq ($(CONFIG_XFS_QUOTA),y)
-xfs-$(CONFIG_PROC_FS)		+= xfs_qm_stats.o
-endif
 xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
 xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
 xfs-$(CONFIG_PROC_FS)		+= xfs_stats.o
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index ce84ffd0264c..0f0df2759b09 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -35,6 +35,7 @@
 #include "xfs_error.h"
 #include "xfs_trace.h"
 
+struct workqueue_struct *xfs_alloc_wq;
 
 #define	XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
 
@@ -68,7 +69,7 @@ xfs_alloc_lookup_eq(
  * Lookup the first record greater than or equal to [bno, len]
  * in the btree given by cur.
  */
-STATIC int				/* error */
+int					/* error */
 xfs_alloc_lookup_ge(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
 	xfs_agblock_t		bno,	/* starting block of extent */
@@ -2207,7 +2208,7 @@ xfs_alloc_read_agf(
  * group or loop over the allocation groups to find the result.
  */
 int				/* error */
-xfs_alloc_vextent(
+__xfs_alloc_vextent(
 	xfs_alloc_arg_t	*args)	/* allocation argument structure */
 {
 	xfs_agblock_t	agsize;	/* allocation group size */
@@ -2417,6 +2418,37 @@ error0:
 	return error;
 }
 
+static void
+xfs_alloc_vextent_worker(
+	struct work_struct	*work)
+{
+	struct xfs_alloc_arg	*args = container_of(work,
+						     struct xfs_alloc_arg, work);
+	unsigned long		pflags;
+
+	/* we are in a transaction context here */
+	current_set_flags_nested(&pflags, PF_FSTRANS);
+
+	args->result = __xfs_alloc_vextent(args);
+	complete(args->done);
+
+	current_restore_flags_nested(&pflags, PF_FSTRANS);
+}
+
+
+int				/* error */
+xfs_alloc_vextent(
+	xfs_alloc_arg_t	*args)	/* allocation argument structure */
+{
+	DECLARE_COMPLETION_ONSTACK(done);
+
+	args->done = &done;
+	INIT_WORK(&args->work, xfs_alloc_vextent_worker);
+	queue_work(xfs_alloc_wq, &args->work);
+	wait_for_completion(&done);
+	return args->result;
+}
+
 /*
  * Free an extent.
  * Just break up the extent address and hand off to xfs_free_ag_extent
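[Annotation: the new xfs_alloc_vextent() wrapper ships the actual allocation off to a workqueue and sleeps on a completion, which puts the deep allocation call chain on a fresh worker stack while the caller blocks. The same synchronous hand-off idiom reduced to a self-contained sketch; do_expensive_work() is a hypothetical stand-in:]

	struct offload_arg {
		struct work_struct	work;
		struct completion	*done;
		int			result;
	};

	static void offload_worker(struct work_struct *work)
	{
		struct offload_arg *arg = container_of(work,
						       struct offload_arg, work);

		arg->result = do_expensive_work();	/* hypothetical payload */
		complete(arg->done);			/* wake the submitter */
	}

	static int offload_sync(struct workqueue_struct *wq)
	{
		DECLARE_COMPLETION_ONSTACK(done);
		struct offload_arg arg = { .done = &done };

		INIT_WORK_ONSTACK(&arg.work, offload_worker);
		queue_work(wq, &arg.work);
		wait_for_completion(&done);	/* worker ran on its own stack */
		return arg.result;
	}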
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 2f52b924be79..3a7e7d8f8ded 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -25,6 +25,8 @@ struct xfs_perag;
25struct xfs_trans; 25struct xfs_trans;
26struct xfs_busy_extent; 26struct xfs_busy_extent;
27 27
28extern struct workqueue_struct *xfs_alloc_wq;
29
28/* 30/*
29 * Freespace allocation types. Argument to xfs_alloc_[v]extent. 31 * Freespace allocation types. Argument to xfs_alloc_[v]extent.
30 */ 32 */
@@ -119,6 +121,9 @@ typedef struct xfs_alloc_arg {
119 char isfl; /* set if is freelist blocks - !acctg */ 121 char isfl; /* set if is freelist blocks - !acctg */
120 char userdata; /* set if this is user data */ 122 char userdata; /* set if this is user data */
121 xfs_fsblock_t firstblock; /* io first block allocated */ 123 xfs_fsblock_t firstblock; /* io first block allocated */
124 struct completion *done;
125 struct work_struct work;
126 int result;
122} xfs_alloc_arg_t; 127} xfs_alloc_arg_t;
123 128
124/* 129/*
@@ -243,6 +248,13 @@ xfs_alloc_lookup_le(
243 xfs_extlen_t len, /* length of extent */ 248 xfs_extlen_t len, /* length of extent */
244 int *stat); /* success/failure */ 249 int *stat); /* success/failure */
245 250
251int /* error */
252xfs_alloc_lookup_ge(
253 struct xfs_btree_cur *cur, /* btree cursor */
254 xfs_agblock_t bno, /* starting block of extent */
255 xfs_extlen_t len, /* length of extent */
256 int *stat); /* success/failure */
257
246int /* error */ 258int /* error */
247xfs_alloc_get_rec( 259xfs_alloc_get_rec(
248 struct xfs_btree_cur *cur, /* btree cursor */ 260 struct xfs_btree_cur *cur, /* btree cursor */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 74b9baf36ac3..0dbb9e70fe21 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -26,6 +26,7 @@
26#include "xfs_bmap_btree.h" 26#include "xfs_bmap_btree.h"
27#include "xfs_dinode.h" 27#include "xfs_dinode.h"
28#include "xfs_inode.h" 28#include "xfs_inode.h"
29#include "xfs_inode_item.h"
29#include "xfs_alloc.h" 30#include "xfs_alloc.h"
30#include "xfs_error.h" 31#include "xfs_error.h"
31#include "xfs_rw.h" 32#include "xfs_rw.h"
@@ -99,23 +100,6 @@ xfs_destroy_ioend(
 }
 
 /*
- * If the end of the current ioend is beyond the current EOF,
- * return the new EOF value, otherwise zero.
- */
-STATIC xfs_fsize_t
-xfs_ioend_new_eof(
-	xfs_ioend_t		*ioend)
-{
-	xfs_inode_t		*ip = XFS_I(ioend->io_inode);
-	xfs_fsize_t		isize;
-	xfs_fsize_t		bsize;
-
-	bsize = ioend->io_offset + ioend->io_size;
-	isize = MIN(i_size_read(VFS_I(ip)), bsize);
-	return isize > ip->i_d.di_size ? isize : 0;
-}
-
-/*
  * Fast and loose check if this write could update the on-disk inode size.
  */
 static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
@@ -124,32 +108,65 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
 		XFS_I(ioend->io_inode)->i_d.di_size;
 }
 
+STATIC int
+xfs_setfilesize_trans_alloc(
+	struct xfs_ioend	*ioend)
+{
+	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+
+	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	ioend->io_append_trans = tp;
+
+	/*
+	 * We hand off the transaction to the completion thread now, so
+	 * clear the flag here.
+	 */
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	return 0;
+}
+
 /*
  * Update on-disk file size now that data has been written to disk.
- *
- * This function does not block as blocking on the inode lock in IO completion
- * can lead to IO completion order dependency deadlocks.. If it can't get the
- * inode ilock it will return EAGAIN. Callers must handle this.
  */
 STATIC int
 xfs_setfilesize(
-	xfs_ioend_t		*ioend)
+	struct xfs_ioend	*ioend)
 {
-	xfs_inode_t		*ip = XFS_I(ioend->io_inode);
+	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
+	struct xfs_trans	*tp = ioend->io_append_trans;
 	xfs_fsize_t		isize;
 
-	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
-		return EAGAIN;
+	/*
+	 * The transaction was allocated in the I/O submission thread,
+	 * thus we need to mark ourselves as being in a transaction
+	 * manually.
+	 */
+	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
-	isize = xfs_ioend_new_eof(ioend);
-	if (isize) {
-		trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
-		ip->i_d.di_size = isize;
-		xfs_mark_inode_dirty(ip);
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
+	if (!isize) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_cancel(tp, 0);
+		return 0;
 	}
 
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return 0;
+	trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
+
+	ip->i_d.di_size = isize;
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	return xfs_trans_commit(tp, 0);
 }
 
155/* 172/*
@@ -163,10 +180,12 @@ xfs_finish_ioend(
 	struct xfs_ioend	*ioend)
 {
 	if (atomic_dec_and_test(&ioend->io_remaining)) {
+		struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
+
 		if (ioend->io_type == IO_UNWRITTEN)
-			queue_work(xfsconvertd_workqueue, &ioend->io_work);
-		else if (xfs_ioend_is_append(ioend))
-			queue_work(xfsdatad_workqueue, &ioend->io_work);
+			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
+		else if (ioend->io_append_trans)
+			queue_work(mp->m_data_workqueue, &ioend->io_work);
 		else
 			xfs_destroy_ioend(ioend);
 	}
@@ -195,35 +214,36 @@ xfs_end_io(
 	 * range to normal written extents after the data I/O has finished.
 	 */
 	if (ioend->io_type == IO_UNWRITTEN) {
+		/*
+		 * For buffered I/O we never preallocate a transaction when
+		 * doing the unwritten extent conversion, but for direct I/O
+		 * we do not know if we are converting an unwritten extent
+		 * or not at the point where we preallocate the transaction.
+		 */
+		if (ioend->io_append_trans) {
+			ASSERT(ioend->io_isdirect);
+
+			current_set_flags_nested(
+				&ioend->io_append_trans->t_pflags, PF_FSTRANS);
+			xfs_trans_cancel(ioend->io_append_trans, 0);
+		}
+
 		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
 						  ioend->io_size);
 		if (error) {
 			ioend->io_error = -error;
 			goto done;
 		}
+	} else if (ioend->io_append_trans) {
+		error = xfs_setfilesize(ioend);
+		if (error)
+			ioend->io_error = -error;
+	} else {
+		ASSERT(!xfs_ioend_is_append(ioend));
 	}
 
-	/*
-	 * We might have to update the on-disk file size after extending
-	 * writes.
-	 */
-	error = xfs_setfilesize(ioend);
-	ASSERT(!error || error == EAGAIN);
-
 done:
-	/*
-	 * If we didn't complete processing of the ioend, requeue it to the
-	 * tail of the workqueue for another attempt later. Otherwise destroy
-	 * it.
-	 */
-	if (error == EAGAIN) {
-		atomic_inc(&ioend->io_remaining);
-		xfs_finish_ioend(ioend);
-		/* ensure we don't spin on blocked ioends */
-		delay(1);
-	} else {
-		xfs_destroy_ioend(ioend);
-	}
+	xfs_destroy_ioend(ioend);
 }
228 248
229/* 249/*
@@ -259,6 +279,7 @@ xfs_alloc_ioend(
259 */ 279 */
260 atomic_set(&ioend->io_remaining, 1); 280 atomic_set(&ioend->io_remaining, 1);
261 ioend->io_isasync = 0; 281 ioend->io_isasync = 0;
282 ioend->io_isdirect = 0;
262 ioend->io_error = 0; 283 ioend->io_error = 0;
263 ioend->io_list = NULL; 284 ioend->io_list = NULL;
264 ioend->io_type = type; 285 ioend->io_type = type;
@@ -269,6 +290,7 @@ xfs_alloc_ioend(
269 ioend->io_size = 0; 290 ioend->io_size = 0;
270 ioend->io_iocb = NULL; 291 ioend->io_iocb = NULL;
271 ioend->io_result = 0; 292 ioend->io_result = 0;
293 ioend->io_append_trans = NULL;
272 294
273 INIT_WORK(&ioend->io_work, xfs_end_io); 295 INIT_WORK(&ioend->io_work, xfs_end_io);
274 return ioend; 296 return ioend;
@@ -379,14 +401,6 @@ xfs_submit_ioend_bio(
379 atomic_inc(&ioend->io_remaining); 401 atomic_inc(&ioend->io_remaining);
380 bio->bi_private = ioend; 402 bio->bi_private = ioend;
381 bio->bi_end_io = xfs_end_bio; 403 bio->bi_end_io = xfs_end_bio;
382
383 /*
384 * If the I/O is beyond EOF we mark the inode dirty immediately
385 * but don't update the inode size until I/O completion.
386 */
387 if (xfs_ioend_new_eof(ioend))
388 xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
389
390 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio); 404 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
391} 405}
392 406
@@ -1033,8 +1047,20 @@ xfs_vm_writepage(
 					wbc, end_index);
 	}
 
-	if (iohead)
+	if (iohead) {
+		/*
+		 * Reserve log space if we might write beyond the on-disk
+		 * inode size.
+		 */
+		if (ioend->io_type != IO_UNWRITTEN &&
+		    xfs_ioend_is_append(ioend)) {
+			err = xfs_setfilesize_trans_alloc(ioend);
+			if (err)
+				goto error;
+		}
+
 		xfs_submit_ioend(wbc, iohead);
+	}
 
 	return 0;
 
@@ -1314,17 +1340,32 @@ xfs_vm_direct_IO(
 {
 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
 	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
+	struct xfs_ioend	*ioend = NULL;
 	ssize_t			ret;
 
 	if (rw & WRITE) {
-		iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
+		size_t size = iov_length(iov, nr_segs);
+
+		/*
+		 * We need to preallocate a transaction for a size update
+		 * here.  In the case that this write both updates the size
+		 * and converts at least one unwritten extent we will cancel
+		 * the still clean transaction after the I/O has finished.
+		 */
+		iocb->private = ioend = xfs_alloc_ioend(inode, IO_DIRECT);
+		if (offset + size > XFS_I(inode)->i_d.di_size) {
+			ret = xfs_setfilesize_trans_alloc(ioend);
+			if (ret)
+				goto out_destroy_ioend;
+			ioend->io_isdirect = 1;
+		}
 
 		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
 					   offset, nr_segs,
 					   xfs_get_blocks_direct,
 					   xfs_end_io_direct_write, NULL, 0);
 		if (ret != -EIOCBQUEUED && iocb->private)
-			xfs_destroy_ioend(iocb->private);
+			goto out_trans_cancel;
 	} else {
 		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
 					   offset, nr_segs,
@@ -1333,6 +1374,16 @@ xfs_vm_direct_IO(
 	}
 
 	return ret;
+
+out_trans_cancel:
+	if (ioend->io_append_trans) {
+		current_set_flags_nested(&ioend->io_append_trans->t_pflags,
+					 PF_FSTRANS);
+		xfs_trans_cancel(ioend->io_append_trans, 0);
+	}
+out_destroy_ioend:
+	xfs_destroy_ioend(ioend);
+	return ret;
 }
1337 1388
1338STATIC void 1389STATIC void
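[Annotation: the common thread in these aops changes is that the file-size-update transaction is now reserved in the submission context, where blocking is safe, and handed to I/O completion to be committed or cancelled. A condensed, illustrative-only sketch of the decision made at submit time, using the names introduced by the diff:]

	/* Sketch: reserve log space before the write is queued whenever
	 * completion might have to move the on-disk EOF. */
	static int prepare_append_write(struct xfs_ioend *ioend)
	{
		if (ioend->io_type == IO_UNWRITTEN)
			return 0;	/* conversion path manages its own transaction */
		if (!xfs_ioend_is_append(ioend))
			return 0;	/* on-disk size cannot change */
		return xfs_setfilesize_trans_alloc(ioend);
	}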
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 116dd5c37034..84eafbcb0d9d 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -18,8 +18,6 @@
18#ifndef __XFS_AOPS_H__ 18#ifndef __XFS_AOPS_H__
19#define __XFS_AOPS_H__ 19#define __XFS_AOPS_H__
20 20
21extern struct workqueue_struct *xfsdatad_workqueue;
22extern struct workqueue_struct *xfsconvertd_workqueue;
23extern mempool_t *xfs_ioend_pool; 21extern mempool_t *xfs_ioend_pool;
24 22
25/* 23/*
@@ -48,12 +46,14 @@ typedef struct xfs_ioend {
48 int io_error; /* I/O error code */ 46 int io_error; /* I/O error code */
49 atomic_t io_remaining; /* hold count */ 47 atomic_t io_remaining; /* hold count */
50 unsigned int io_isasync : 1; /* needs aio_complete */ 48 unsigned int io_isasync : 1; /* needs aio_complete */
49 unsigned int io_isdirect : 1;/* direct I/O */
51 struct inode *io_inode; /* file being written to */ 50 struct inode *io_inode; /* file being written to */
52 struct buffer_head *io_buffer_head;/* buffer linked list head */ 51 struct buffer_head *io_buffer_head;/* buffer linked list head */
53 struct buffer_head *io_buffer_tail;/* buffer linked list tail */ 52 struct buffer_head *io_buffer_tail;/* buffer linked list tail */
54 size_t io_size; /* size of the extent */ 53 size_t io_size; /* size of the extent */
55 xfs_off_t io_offset; /* offset in the file */ 54 xfs_off_t io_offset; /* offset in the file */
56 struct work_struct io_work; /* xfsdatad work queue */ 55 struct work_struct io_work; /* xfsdatad work queue */
56 struct xfs_trans *io_append_trans;/* xact. for size update */
57 struct kiocb *io_iocb; 57 struct kiocb *io_iocb;
58 int io_result; 58 int io_result;
59} xfs_ioend_t; 59} xfs_ioend_t;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 08b9ac644c31..65d61b948ead 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -853,6 +853,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
853{ 853{
854 int newsize, forkoff, retval; 854 int newsize, forkoff, retval;
855 855
856 trace_xfs_attr_sf_addname(args);
857
856 retval = xfs_attr_shortform_lookup(args); 858 retval = xfs_attr_shortform_lookup(args);
857 if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { 859 if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
858 return(retval); 860 return(retval);
@@ -896,6 +898,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
896 xfs_dabuf_t *bp; 898 xfs_dabuf_t *bp;
897 int retval, error, committed, forkoff; 899 int retval, error, committed, forkoff;
898 900
901 trace_xfs_attr_leaf_addname(args);
902
899 /* 903 /*
900 * Read the (only) block in the attribute list in. 904 * Read the (only) block in the attribute list in.
901 */ 905 */
@@ -920,6 +924,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
920 xfs_da_brelse(args->trans, bp); 924 xfs_da_brelse(args->trans, bp);
921 return(retval); 925 return(retval);
922 } 926 }
927
928 trace_xfs_attr_leaf_replace(args);
929
923 args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ 930 args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */
924 args->blkno2 = args->blkno; /* set 2nd entry info*/ 931 args->blkno2 = args->blkno; /* set 2nd entry info*/
925 args->index2 = args->index; 932 args->index2 = args->index;
@@ -1090,6 +1097,8 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
1090 xfs_dabuf_t *bp; 1097 xfs_dabuf_t *bp;
1091 int error, committed, forkoff; 1098 int error, committed, forkoff;
1092 1099
1100 trace_xfs_attr_leaf_removename(args);
1101
1093 /* 1102 /*
1094 * Remove the attribute. 1103 * Remove the attribute.
1095 */ 1104 */
@@ -1223,6 +1232,8 @@ xfs_attr_node_addname(xfs_da_args_t *args)
1223 xfs_mount_t *mp; 1232 xfs_mount_t *mp;
1224 int committed, retval, error; 1233 int committed, retval, error;
1225 1234
1235 trace_xfs_attr_node_addname(args);
1236
1226 /* 1237 /*
1227 * Fill in bucket of arguments/results/context to carry around. 1238 * Fill in bucket of arguments/results/context to carry around.
1228 */ 1239 */
@@ -1249,6 +1260,9 @@ restart:
1249 } else if (retval == EEXIST) { 1260 } else if (retval == EEXIST) {
1250 if (args->flags & ATTR_CREATE) 1261 if (args->flags & ATTR_CREATE)
1251 goto out; 1262 goto out;
1263
1264 trace_xfs_attr_node_replace(args);
1265
1252 args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ 1266 args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */
1253 args->blkno2 = args->blkno; /* set 2nd entry info*/ 1267 args->blkno2 = args->blkno; /* set 2nd entry info*/
1254 args->index2 = args->index; 1268 args->index2 = args->index;
@@ -1480,6 +1494,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1480 xfs_dabuf_t *bp; 1494 xfs_dabuf_t *bp;
1481 int retval, error, committed, forkoff; 1495 int retval, error, committed, forkoff;
1482 1496
1497 trace_xfs_attr_node_removename(args);
1498
1483 /* 1499 /*
1484 * Tie a string around our finger to remind us where we are. 1500 * Tie a string around our finger to remind us where we are.
1485 */ 1501 */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index d25eafd4d28d..76d93dc953e1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -235,6 +235,8 @@ xfs_attr_shortform_create(xfs_da_args_t *args)
235 xfs_inode_t *dp; 235 xfs_inode_t *dp;
236 xfs_ifork_t *ifp; 236 xfs_ifork_t *ifp;
237 237
238 trace_xfs_attr_sf_create(args);
239
238 dp = args->dp; 240 dp = args->dp;
239 ASSERT(dp != NULL); 241 ASSERT(dp != NULL);
240 ifp = dp->i_afp; 242 ifp = dp->i_afp;
@@ -268,6 +270,8 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
268 xfs_inode_t *dp; 270 xfs_inode_t *dp;
269 xfs_ifork_t *ifp; 271 xfs_ifork_t *ifp;
270 272
273 trace_xfs_attr_sf_add(args);
274
271 dp = args->dp; 275 dp = args->dp;
272 mp = dp->i_mount; 276 mp = dp->i_mount;
273 dp->i_d.di_forkoff = forkoff; 277 dp->i_d.di_forkoff = forkoff;
@@ -337,6 +341,8 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
337 xfs_mount_t *mp; 341 xfs_mount_t *mp;
338 xfs_inode_t *dp; 342 xfs_inode_t *dp;
339 343
344 trace_xfs_attr_sf_remove(args);
345
340 dp = args->dp; 346 dp = args->dp;
341 mp = dp->i_mount; 347 mp = dp->i_mount;
342 base = sizeof(xfs_attr_sf_hdr_t); 348 base = sizeof(xfs_attr_sf_hdr_t);
@@ -405,6 +411,8 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
405 int i; 411 int i;
406 xfs_ifork_t *ifp; 412 xfs_ifork_t *ifp;
407 413
414 trace_xfs_attr_sf_lookup(args);
415
408 ifp = args->dp->i_afp; 416 ifp = args->dp->i_afp;
409 ASSERT(ifp->if_flags & XFS_IFINLINE); 417 ASSERT(ifp->if_flags & XFS_IFINLINE);
410 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; 418 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
@@ -476,6 +484,8 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
476 xfs_dabuf_t *bp; 484 xfs_dabuf_t *bp;
477 xfs_ifork_t *ifp; 485 xfs_ifork_t *ifp;
478 486
487 trace_xfs_attr_sf_to_leaf(args);
488
479 dp = args->dp; 489 dp = args->dp;
480 ifp = dp->i_afp; 490 ifp = dp->i_afp;
481 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; 491 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
@@ -775,6 +785,8 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
775 char *tmpbuffer; 785 char *tmpbuffer;
776 int error, i; 786 int error, i;
777 787
788 trace_xfs_attr_leaf_to_sf(args);
789
778 dp = args->dp; 790 dp = args->dp;
779 tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP); 791 tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
780 ASSERT(tmpbuffer != NULL); 792 ASSERT(tmpbuffer != NULL);
@@ -848,6 +860,8 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
848 xfs_dablk_t blkno; 860 xfs_dablk_t blkno;
849 int error; 861 int error;
850 862
863 trace_xfs_attr_leaf_to_node(args);
864
851 dp = args->dp; 865 dp = args->dp;
852 bp1 = bp2 = NULL; 866 bp1 = bp2 = NULL;
853 error = xfs_da_grow_inode(args, &blkno); 867 error = xfs_da_grow_inode(args, &blkno);
@@ -911,6 +925,8 @@ xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
911 xfs_dabuf_t *bp; 925 xfs_dabuf_t *bp;
912 int error; 926 int error;
913 927
928 trace_xfs_attr_leaf_create(args);
929
914 dp = args->dp; 930 dp = args->dp;
915 ASSERT(dp != NULL); 931 ASSERT(dp != NULL);
916 error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp, 932 error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
@@ -948,6 +964,8 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
948 xfs_dablk_t blkno; 964 xfs_dablk_t blkno;
949 int error; 965 int error;
950 966
967 trace_xfs_attr_leaf_split(state->args);
968
951 /* 969 /*
952 * Allocate space for a new leaf node. 970 * Allocate space for a new leaf node.
953 */ 971 */
@@ -977,10 +995,13 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
 	 *
 	 * Insert the "new" entry in the correct block.
 	 */
-	if (state->inleaf)
+	if (state->inleaf) {
+		trace_xfs_attr_leaf_add_old(state->args);
 		error = xfs_attr_leaf_add(oldblk->bp, state->args);
-	else
+	} else {
+		trace_xfs_attr_leaf_add_new(state->args);
 		error = xfs_attr_leaf_add(newblk->bp, state->args);
+	}
 
 	/*
 	 * Update last hashval in each block since we added the name.
@@ -1001,6 +1022,8 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args)
1001 xfs_attr_leaf_map_t *map; 1022 xfs_attr_leaf_map_t *map;
1002 int tablesize, entsize, sum, tmp, i; 1023 int tablesize, entsize, sum, tmp, i;
1003 1024
1025 trace_xfs_attr_leaf_add(args);
1026
1004 leaf = bp->data; 1027 leaf = bp->data;
1005 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1028 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1006 ASSERT((args->index >= 0) 1029 ASSERT((args->index >= 0)
@@ -1128,8 +1151,6 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
1128 (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval))); 1151 (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
1129 1152
1130 /* 1153 /*
1131 * Copy the attribute name and value into the new space.
1132 *
1133 * For "remote" attribute values, simply note that we need to 1154 * For "remote" attribute values, simply note that we need to
1134 * allocate space for the "remote" value. We can't actually 1155 * allocate space for the "remote" value. We can't actually
1135 * allocate the extents in this transaction, and we can't decide 1156 * allocate the extents in this transaction, and we can't decide
@@ -1265,6 +1286,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1265 ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1286 ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1266 args = state->args; 1287 args = state->args;
1267 1288
1289 trace_xfs_attr_leaf_rebalance(args);
1290
1268 /* 1291 /*
1269 * Check ordering of blocks, reverse if it makes things simpler. 1292 * Check ordering of blocks, reverse if it makes things simpler.
1270 * 1293 *
@@ -1810,6 +1833,8 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1810 xfs_mount_t *mp; 1833 xfs_mount_t *mp;
1811 char *tmpbuffer; 1834 char *tmpbuffer;
1812 1835
1836 trace_xfs_attr_leaf_unbalance(state->args);
1837
1813 /* 1838 /*
1814 * Set up environment. 1839 * Set up environment.
1815 */ 1840 */
@@ -1919,6 +1944,8 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
1919 int probe, span; 1944 int probe, span;
1920 xfs_dahash_t hashval; 1945 xfs_dahash_t hashval;
1921 1946
1947 trace_xfs_attr_leaf_lookup(args);
1948
1922 leaf = bp->data; 1949 leaf = bp->data;
1923 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1950 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1924 ASSERT(be16_to_cpu(leaf->hdr.count) 1951 ASSERT(be16_to_cpu(leaf->hdr.count)
@@ -2445,6 +2472,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
2445 char *name; 2472 char *name;
2446#endif /* DEBUG */ 2473#endif /* DEBUG */
2447 2474
2475 trace_xfs_attr_leaf_clearflag(args);
2448 /* 2476 /*
2449 * Set up the operation. 2477 * Set up the operation.
2450 */ 2478 */
@@ -2509,6 +2537,8 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
2509 xfs_dabuf_t *bp; 2537 xfs_dabuf_t *bp;
2510 int error; 2538 int error;
2511 2539
2540 trace_xfs_attr_leaf_setflag(args);
2541
2512 /* 2542 /*
2513 * Set up the operation. 2543 * Set up the operation.
2514 */ 2544 */
@@ -2565,6 +2595,8 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
2565 char *name1, *name2; 2595 char *name1, *name2;
2566#endif /* DEBUG */ 2596#endif /* DEBUG */
2567 2597
2598 trace_xfs_attr_leaf_flipflags(args);
2599
2568 /* 2600 /*
2569 * Read the block containing the "old" attr 2601 * Read the block containing the "old" attr
2570 */ 2602 */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 188ef2fbd628..85e7e327bcd8 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5124,6 +5124,15 @@ xfs_bunmapi(
5124 cur->bc_private.b.flags = 0; 5124 cur->bc_private.b.flags = 0;
5125 } else 5125 } else
5126 cur = NULL; 5126 cur = NULL;
5127
5128 if (isrt) {
5129 /*
5130 * Synchronize by locking the bitmap inode.
5131 */
5132 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
5133 xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
5134 }
5135
5127 extno = 0; 5136 extno = 0;
5128 while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && 5137 while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
5129 (nexts == 0 || extno < nexts)) { 5138 (nexts == 0 || extno < nexts)) {
@@ -5536,8 +5545,12 @@ xfs_getbmap(
 	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
 		return XFS_ERROR(ENOMEM);
 	out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
-	if (!out)
-		return XFS_ERROR(ENOMEM);
+	if (!out) {
+		out = kmem_zalloc_large(bmv->bmv_count *
+					sizeof(struct getbmapx));
+		if (!out)
+			return XFS_ERROR(ENOMEM);
+	}
 
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
@@ -5661,7 +5674,10 @@ xfs_getbmap(
 			break;
 	}
 
-	kmem_free(out);
+	if (is_vmalloc_addr(out))
+		kmem_free_large(out);
+	else
+		kmem_free(out);
 	return error;
 }
 
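[Annotation: the getbmap change retries a failed kmalloc-backed allocation with a vmalloc-backed one and uses is_vmalloc_addr() to free symmetrically; kmem_zalloc_large()/kmem_free_large() are XFS's wrappers around that idea. The generic shape of the pattern, sketched with plain kernel primitives:]

	#include <linux/mm.h>		/* is_vmalloc_addr() */
	#include <linux/slab.h>
	#include <linux/vmalloc.h>

	static void *zalloc_fallback(size_t size)
	{
		void *p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);

		if (!p)
			p = vzalloc(size);	/* large buffers: virtually contiguous */
		return p;
	}

	static void zfree_fallback(void *p)
	{
		if (is_vmalloc_addr(p))
			vfree(p);
		else
			kfree(p);
	}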
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4dff85c7d7eb..6819b5163e33 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -45,8 +45,6 @@ static kmem_zone_t *xfs_buf_zone;
45STATIC int xfsbufd(void *); 45STATIC int xfsbufd(void *);
46 46
47static struct workqueue_struct *xfslogd_workqueue; 47static struct workqueue_struct *xfslogd_workqueue;
48struct workqueue_struct *xfsdatad_workqueue;
49struct workqueue_struct *xfsconvertd_workqueue;
50 48
51#ifdef XFS_BUF_LOCK_TRACKING 49#ifdef XFS_BUF_LOCK_TRACKING
52# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) 50# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
@@ -1793,21 +1791,8 @@ xfs_buf_init(void)
1793 if (!xfslogd_workqueue) 1791 if (!xfslogd_workqueue)
1794 goto out_free_buf_zone; 1792 goto out_free_buf_zone;
1795 1793
1796 xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
1797 if (!xfsdatad_workqueue)
1798 goto out_destroy_xfslogd_workqueue;
1799
1800 xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
1801 WQ_MEM_RECLAIM, 1);
1802 if (!xfsconvertd_workqueue)
1803 goto out_destroy_xfsdatad_workqueue;
1804
1805 return 0; 1794 return 0;
1806 1795
1807 out_destroy_xfsdatad_workqueue:
1808 destroy_workqueue(xfsdatad_workqueue);
1809 out_destroy_xfslogd_workqueue:
1810 destroy_workqueue(xfslogd_workqueue);
1811 out_free_buf_zone: 1796 out_free_buf_zone:
1812 kmem_zone_destroy(xfs_buf_zone); 1797 kmem_zone_destroy(xfs_buf_zone);
1813 out: 1798 out:
@@ -1817,8 +1802,6 @@ xfs_buf_init(void)
1817void 1802void
1818xfs_buf_terminate(void) 1803xfs_buf_terminate(void)
1819{ 1804{
1820 destroy_workqueue(xfsconvertd_workqueue);
1821 destroy_workqueue(xfsdatad_workqueue);
1822 destroy_workqueue(xfslogd_workqueue); 1805 destroy_workqueue(xfslogd_workqueue);
1823 kmem_zone_destroy(xfs_buf_zone); 1806 kmem_zone_destroy(xfs_buf_zone);
1824} 1807}
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 77c74257c2a3..7f1a6f5b05a6 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -108,6 +108,8 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
108 int error; 108 int error;
109 xfs_trans_t *tp; 109 xfs_trans_t *tp;
110 110
111 trace_xfs_da_node_create(args);
112
111 tp = args->trans; 113 tp = args->trans;
112 error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork); 114 error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
113 if (error) 115 if (error)
@@ -140,6 +142,8 @@ xfs_da_split(xfs_da_state_t *state)
140 xfs_dabuf_t *bp; 142 xfs_dabuf_t *bp;
141 int max, action, error, i; 143 int max, action, error, i;
142 144
145 trace_xfs_da_split(state->args);
146
143 /* 147 /*
144 * Walk back up the tree splitting/inserting/adjusting as necessary. 148 * Walk back up the tree splitting/inserting/adjusting as necessary.
145 * If we need to insert and there isn't room, split the node, then 149 * If we need to insert and there isn't room, split the node, then
@@ -178,10 +182,12 @@ xfs_da_split(xfs_da_state_t *state)
178 state->extravalid = 1; 182 state->extravalid = 1;
179 if (state->inleaf) { 183 if (state->inleaf) {
180 state->extraafter = 0; /* before newblk */ 184 state->extraafter = 0; /* before newblk */
185 trace_xfs_attr_leaf_split_before(state->args);
181 error = xfs_attr_leaf_split(state, oldblk, 186 error = xfs_attr_leaf_split(state, oldblk,
182 &state->extrablk); 187 &state->extrablk);
183 } else { 188 } else {
184 state->extraafter = 1; /* after newblk */ 189 state->extraafter = 1; /* after newblk */
190 trace_xfs_attr_leaf_split_after(state->args);
185 error = xfs_attr_leaf_split(state, newblk, 191 error = xfs_attr_leaf_split(state, newblk,
186 &state->extrablk); 192 &state->extrablk);
187 } 193 }
@@ -300,6 +306,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
300 xfs_mount_t *mp; 306 xfs_mount_t *mp;
301 xfs_dir2_leaf_t *leaf; 307 xfs_dir2_leaf_t *leaf;
302 308
309 trace_xfs_da_root_split(state->args);
310
303 /* 311 /*
304 * Copy the existing (incorrect) block from the root node position 312 * Copy the existing (incorrect) block from the root node position
305 * to a free space somewhere. 313 * to a free space somewhere.
@@ -380,6 +388,8 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
380 int newcount, error; 388 int newcount, error;
381 int useextra; 389 int useextra;
382 390
391 trace_xfs_da_node_split(state->args);
392
383 node = oldblk->bp->data; 393 node = oldblk->bp->data;
384 ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); 394 ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
385 395
@@ -466,6 +476,8 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
466 int count, tmp; 476 int count, tmp;
467 xfs_trans_t *tp; 477 xfs_trans_t *tp;
468 478
479 trace_xfs_da_node_rebalance(state->args);
480
469 node1 = blk1->bp->data; 481 node1 = blk1->bp->data;
470 node2 = blk2->bp->data; 482 node2 = blk2->bp->data;
471 /* 483 /*
@@ -574,6 +586,8 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
574 xfs_da_node_entry_t *btree; 586 xfs_da_node_entry_t *btree;
575 int tmp; 587 int tmp;
576 588
589 trace_xfs_da_node_add(state->args);
590
577 node = oldblk->bp->data; 591 node = oldblk->bp->data;
578 ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); 592 ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
579 ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); 593 ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
@@ -619,6 +633,8 @@ xfs_da_join(xfs_da_state_t *state)
619 xfs_da_state_blk_t *drop_blk, *save_blk; 633 xfs_da_state_blk_t *drop_blk, *save_blk;
620 int action, error; 634 int action, error;
621 635
636 trace_xfs_da_join(state->args);
637
622 action = 0; 638 action = 0;
623 drop_blk = &state->path.blk[ state->path.active-1 ]; 639 drop_blk = &state->path.blk[ state->path.active-1 ];
624 save_blk = &state->altpath.blk[ state->path.active-1 ]; 640 save_blk = &state->altpath.blk[ state->path.active-1 ];
@@ -723,6 +739,8 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
723 xfs_dabuf_t *bp; 739 xfs_dabuf_t *bp;
724 int error; 740 int error;
725 741
742 trace_xfs_da_root_join(state->args);
743
726 args = state->args; 744 args = state->args;
727 ASSERT(args != NULL); 745 ASSERT(args != NULL);
728 ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); 746 ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
@@ -941,6 +959,8 @@ xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
941 xfs_da_node_entry_t *btree; 959 xfs_da_node_entry_t *btree;
942 int tmp; 960 int tmp;
943 961
962 trace_xfs_da_node_remove(state->args);
963
944 node = drop_blk->bp->data; 964 node = drop_blk->bp->data;
945 ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count)); 965 ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count));
946 ASSERT(drop_blk->index >= 0); 966 ASSERT(drop_blk->index >= 0);
@@ -984,6 +1004,8 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
984 int tmp; 1004 int tmp;
985 xfs_trans_t *tp; 1005 xfs_trans_t *tp;
986 1006
1007 trace_xfs_da_node_unbalance(state->args);
1008
987 drop_node = drop_blk->bp->data; 1009 drop_node = drop_blk->bp->data;
988 save_node = save_blk->bp->data; 1010 save_node = save_blk->bp->data;
989 ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); 1011 ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
@@ -1230,6 +1252,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
1230 /* 1252 /*
1231 * Link new block in before existing block. 1253 * Link new block in before existing block.
1232 */ 1254 */
1255 trace_xfs_da_link_before(args);
1233 new_info->forw = cpu_to_be32(old_blk->blkno); 1256 new_info->forw = cpu_to_be32(old_blk->blkno);
1234 new_info->back = old_info->back; 1257 new_info->back = old_info->back;
1235 if (old_info->back) { 1258 if (old_info->back) {
@@ -1251,6 +1274,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
1251 /* 1274 /*
1252 * Link new block in after existing block. 1275 * Link new block in after existing block.
1253 */ 1276 */
1277 trace_xfs_da_link_after(args);
1254 new_info->forw = old_info->forw; 1278 new_info->forw = old_info->forw;
1255 new_info->back = cpu_to_be32(old_blk->blkno); 1279 new_info->back = cpu_to_be32(old_blk->blkno);
1256 if (old_info->forw) { 1280 if (old_info->forw) {
@@ -1348,6 +1372,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1348 * Unlink the leaf block from the doubly linked chain of leaves. 1372 * Unlink the leaf block from the doubly linked chain of leaves.
1349 */ 1373 */
1350 if (be32_to_cpu(save_info->back) == drop_blk->blkno) { 1374 if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
1375 trace_xfs_da_unlink_back(args);
1351 save_info->back = drop_info->back; 1376 save_info->back = drop_info->back;
1352 if (drop_info->back) { 1377 if (drop_info->back) {
1353 error = xfs_da_read_buf(args->trans, args->dp, 1378 error = xfs_da_read_buf(args->trans, args->dp,
@@ -1365,6 +1390,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1365 xfs_da_buf_done(bp); 1390 xfs_da_buf_done(bp);
1366 } 1391 }
1367 } else { 1392 } else {
1393 trace_xfs_da_unlink_forward(args);
1368 save_info->forw = drop_info->forw; 1394 save_info->forw = drop_info->forw;
1369 if (drop_info->forw) { 1395 if (drop_info->forw) {
1370 error = xfs_da_read_buf(args->trans, args->dp, 1396 error = xfs_da_read_buf(args->trans, args->dp,
@@ -1652,6 +1678,8 @@ xfs_da_grow_inode(
1652 int count; 1678 int count;
1653 int error; 1679 int error;
1654 1680
1681 trace_xfs_da_grow_inode(args);
1682
1655 if (args->whichfork == XFS_DATA_FORK) { 1683 if (args->whichfork == XFS_DATA_FORK) {
1656 bno = args->dp->i_mount->m_dirleafblk; 1684 bno = args->dp->i_mount->m_dirleafblk;
1657 count = args->dp->i_mount->m_dirblkfsbs; 1685 count = args->dp->i_mount->m_dirblkfsbs;
@@ -1690,6 +1718,8 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
1690 xfs_dir2_leaf_t *dead_leaf2; 1718 xfs_dir2_leaf_t *dead_leaf2;
1691 xfs_dahash_t dead_hash; 1719 xfs_dahash_t dead_hash;
1692 1720
1721 trace_xfs_da_swap_lastblock(args);
1722
1693 dead_buf = *dead_bufp; 1723 dead_buf = *dead_bufp;
1694 dead_blkno = *dead_blknop; 1724 dead_blkno = *dead_blknop;
1695 tp = args->trans; 1725 tp = args->trans;
@@ -1878,6 +1908,8 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
1878 xfs_trans_t *tp; 1908 xfs_trans_t *tp;
1879 xfs_mount_t *mp; 1909 xfs_mount_t *mp;
1880 1910
1911 trace_xfs_da_shrink_inode(args);
1912
1881 dp = args->dp; 1913 dp = args->dp;
1882 w = args->whichfork; 1914 w = args->whichfork;
1883 tp = args->trans; 1915 tp = args->trans;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index dd974a55c77d..1137bbc5eccb 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -215,7 +215,7 @@ xfs_swap_extents(
 	xfs_trans_t	*tp;
 	xfs_bstat_t	*sbp = &sxp->sx_stat;
 	xfs_ifork_t	*tempifp, *ifp, *tifp;
-	int		ilf_fields, tilf_fields;
+	int		src_log_flags, target_log_flags;
 	int		error = 0;
 	int		aforkblks = 0;
 	int		taforkblks = 0;
@@ -385,9 +385,8 @@ xfs_swap_extents(
 	tip->i_delayed_blks = ip->i_delayed_blks;
 	ip->i_delayed_blks = 0;
 
-	ilf_fields = XFS_ILOG_CORE;
-
-	switch(ip->i_d.di_format) {
+	src_log_flags = XFS_ILOG_CORE;
+	switch (ip->i_d.di_format) {
 	case XFS_DINODE_FMT_EXTENTS:
 		/* If the extents fit in the inode, fix the
 		 * pointer.  Otherwise it's already NULL or
@@ -397,16 +396,15 @@ xfs_swap_extents(
 			ifp->if_u1.if_extents =
 				ifp->if_u2.if_inline_ext;
 		}
-		ilf_fields |= XFS_ILOG_DEXT;
+		src_log_flags |= XFS_ILOG_DEXT;
 		break;
 	case XFS_DINODE_FMT_BTREE:
-		ilf_fields |= XFS_ILOG_DBROOT;
+		src_log_flags |= XFS_ILOG_DBROOT;
 		break;
 	}
 
-	tilf_fields = XFS_ILOG_CORE;
-
-	switch(tip->i_d.di_format) {
+	target_log_flags = XFS_ILOG_CORE;
+	switch (tip->i_d.di_format) {
 	case XFS_DINODE_FMT_EXTENTS:
 		/* If the extents fit in the inode, fix the
 		 * pointer.  Otherwise it's already NULL or
@@ -416,10 +414,10 @@ xfs_swap_extents(
 			tifp->if_u1.if_extents =
 				tifp->if_u2.if_inline_ext;
 		}
-		tilf_fields |= XFS_ILOG_DEXT;
+		target_log_flags |= XFS_ILOG_DEXT;
 		break;
 	case XFS_DINODE_FMT_BTREE:
-		tilf_fields |= XFS_ILOG_DBROOT;
+		target_log_flags |= XFS_ILOG_DBROOT;
 		break;
 	}
 
@@ -427,8 +425,8 @@ xfs_swap_extents(
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 	xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 
-	xfs_trans_log_inode(tp, ip, ilf_fields);
-	xfs_trans_log_inode(tp, tip, tilf_fields);
+	xfs_trans_log_inode(tp, ip, src_log_flags);
+	xfs_trans_log_inode(tp, tip, target_log_flags);
 
 	/*
 	 * If this is a synchronous mount, make sure that the
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 9245e029b8ea..d3b63aefd01d 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -29,6 +29,7 @@
29#include "xfs_dinode.h" 29#include "xfs_dinode.h"
30#include "xfs_inode.h" 30#include "xfs_inode.h"
31#include "xfs_inode_item.h" 31#include "xfs_inode_item.h"
32#include "xfs_dir2.h"
32#include "xfs_dir2_format.h" 33#include "xfs_dir2_format.h"
33#include "xfs_dir2_priv.h" 34#include "xfs_dir2_priv.h"
34#include "xfs_error.h" 35#include "xfs_error.h"
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 286a051f12cf..1ad3a4b8ca40 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -37,9 +37,9 @@ STATIC int
 xfs_trim_extents(
 	struct xfs_mount	*mp,
 	xfs_agnumber_t		agno,
-	xfs_fsblock_t		start,
-	xfs_fsblock_t		end,
-	xfs_fsblock_t		minlen,
+	xfs_daddr_t		start,
+	xfs_daddr_t		end,
+	xfs_daddr_t		minlen,
 	__uint64_t		*blocks_trimmed)
 {
 	struct block_device	*bdev = mp->m_ddev_targp->bt_bdev;
@@ -67,7 +67,7 @@ xfs_trim_extents(
 	/*
 	 * Look up the longest btree in the AGF and start with it.
 	 */
-	error = xfs_alloc_lookup_le(cur, 0,
+	error = xfs_alloc_lookup_ge(cur, 0,
 			    be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i);
 	if (error)
 		goto out_del_cursor;
@@ -77,8 +77,10 @@ xfs_trim_extents(
77 * enough to be worth discarding. 77 * enough to be worth discarding.
78 */ 78 */
79 while (i) { 79 while (i) {
80 xfs_agblock_t fbno; 80 xfs_agblock_t fbno;
81 xfs_extlen_t flen; 81 xfs_extlen_t flen;
82 xfs_daddr_t dbno;
83 xfs_extlen_t dlen;
82 84
83 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); 85 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
84 if (error) 86 if (error)
@@ -87,9 +89,17 @@ xfs_trim_extents(
 		ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
 
 		/*
+		 * use daddr format for all range/len calculations as that is
+		 * the format the range/len variables are supplied in by
+		 * userspace.
+		 */
+		dbno = XFS_AGB_TO_DADDR(mp, agno, fbno);
+		dlen = XFS_FSB_TO_BB(mp, flen);
+
+		/*
 		 * Too small?  Give up.
 		 */
-		if (flen < minlen) {
+		if (dlen < minlen) {
 			trace_xfs_discard_toosmall(mp, agno, fbno, flen);
 			goto out_del_cursor;
 		}
@@ -99,8 +109,7 @@ xfs_trim_extents(
99 * supposed to discard skip it. Do not bother to trim 109 * supposed to discard skip it. Do not bother to trim
100 * down partially overlapping ranges for now. 110 * down partially overlapping ranges for now.
101 */ 111 */
102 if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || 112 if (dbno + dlen < start || dbno > end) {
103 XFS_AGB_TO_FSB(mp, agno, fbno) > end) {
104 trace_xfs_discard_exclude(mp, agno, fbno, flen); 113 trace_xfs_discard_exclude(mp, agno, fbno, flen);
105 goto next_extent; 114 goto next_extent;
106 } 115 }
@@ -115,10 +124,7 @@ xfs_trim_extents(
115 } 124 }
116 125
117 trace_xfs_discard_extent(mp, agno, fbno, flen); 126 trace_xfs_discard_extent(mp, agno, fbno, flen);
118 error = -blkdev_issue_discard(bdev, 127 error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
119 XFS_AGB_TO_DADDR(mp, agno, fbno),
120 XFS_FSB_TO_BB(mp, flen),
121 GFP_NOFS, 0);
122 if (error) 128 if (error)
123 goto out_del_cursor; 129 goto out_del_cursor;
124 *blocks_trimmed += flen; 130 *blocks_trimmed += flen;
@@ -137,6 +143,15 @@ out_put_perag:
137 return error; 143 return error;
138} 144}
139 145
146/*
147 * trim a range of the filesystem.
148 *
149 * Note: the parameters passed from userspace are byte ranges into the
150 * filesystem which does not match to the format we use for filesystem block
151 * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
152 * is a linear address range. Hence we need to use DADDR based conversions and
153 * comparisons for determining the correct offset and regions to trim.
154 */
140int 155int
141xfs_ioc_trim( 156xfs_ioc_trim(
142 struct xfs_mount *mp, 157 struct xfs_mount *mp,
@@ -145,7 +160,7 @@ xfs_ioc_trim(
145 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; 160 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
146 unsigned int granularity = q->limits.discard_granularity; 161 unsigned int granularity = q->limits.discard_granularity;
147 struct fstrim_range range; 162 struct fstrim_range range;
148 xfs_fsblock_t start, end, minlen; 163 xfs_daddr_t start, end, minlen;
149 xfs_agnumber_t start_agno, end_agno, agno; 164 xfs_agnumber_t start_agno, end_agno, agno;
150 __uint64_t blocks_trimmed = 0; 165 __uint64_t blocks_trimmed = 0;
151 int error, last_error = 0; 166 int error, last_error = 0;
@@ -159,22 +174,22 @@ xfs_ioc_trim(
159 174
160 /* 175 /*
161 * Truncating down the len isn't actually quite correct, but using 176 * Truncating down the len isn't actually quite correct, but using
162 * XFS_B_TO_FSB would mean we trivially get overflows for values 177 * BBTOB would mean we trivially get overflows for values
163 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default 178 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
164 * used by the fstrim application. In the end it really doesn't 179 * used by the fstrim application. In the end it really doesn't
165 * matter as trimming blocks is an advisory interface. 180 * matter as trimming blocks is an advisory interface.
166 */ 181 */
167 start = XFS_B_TO_FSBT(mp, range.start); 182 start = BTOBB(range.start);
168 end = start + XFS_B_TO_FSBT(mp, range.len) - 1; 183 end = start + BTOBBT(range.len) - 1;
169 minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); 184 minlen = BTOBB(max_t(u64, granularity, range.minlen));
170 185
171 if (start >= mp->m_sb.sb_dblocks) 186 if (XFS_BB_TO_FSB(mp, start) >= mp->m_sb.sb_dblocks)
172 return -XFS_ERROR(EINVAL); 187 return -XFS_ERROR(EINVAL);
173 if (end > mp->m_sb.sb_dblocks - 1) 188 if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
174 end = mp->m_sb.sb_dblocks - 1; 189 end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1;
175 190
176 start_agno = XFS_FSB_TO_AGNO(mp, start); 191 start_agno = xfs_daddr_to_agno(mp, start);
177 end_agno = XFS_FSB_TO_AGNO(mp, end); 192 end_agno = xfs_daddr_to_agno(mp, end);
178 193
179 for (agno = start_agno; agno <= end_agno; agno++) { 194 for (agno = start_agno; agno <= end_agno; agno++) {
180 error = -xfs_trim_extents(mp, agno, start, end, minlen, 195 error = -xfs_trim_extents(mp, agno, start, end, minlen,
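The conversions above rely on XFS's 512-byte "basic block" units: BTOBB rounds a byte count up to basic blocks, BTOBBT truncates. A small self-contained sketch of that arithmetic as used by the new xfs_ioc_trim() range setup (BBSHIFT is 9 in XFS; the driver values are illustrative only):

#include <stdio.h>
#include <stdint.h>

#define BBSHIFT 9
#define BBSIZE  (1 << BBSHIFT)
#define BTOBB(bytes)  ((((uint64_t)(bytes)) + BBSIZE - 1) >> BBSHIFT)	/* round up */
#define BTOBBT(bytes) (((uint64_t)(bytes)) >> BBSHIFT)			/* truncate */

int main(void)
{
	uint64_t start_bytes = 1000, len_bytes = 4096;

	/* mirror xfs_ioc_trim(): round the start up, truncate the length */
	uint64_t start = BTOBB(start_bytes);
	uint64_t end = start + BTOBBT(len_bytes) - 1;

	printf("daddr range: [%llu, %llu]\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}

Because daddrs are a linear address space, range comparisons like "dbno + dlen < start" are valid on them directly, which is exactly why the patch converts away from the sparse FSB format.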
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 53db20ee3e77..1155208fa830 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -43,11 +43,10 @@
  * Lock order:
  *
  * ip->i_lock
- *   qh->qh_lock
- *     qi->qi_dqlist_lock
- *       dquot->q_qlock (xfs_dqlock() and friends)
- *         dquot->q_flush (xfs_dqflock() and friends)
- *           xfs_Gqm->qm_dqfrlist_lock
+ *   qi->qi_tree_lock
+ *     dquot->q_qlock (xfs_dqlock() and friends)
+ *       dquot->q_flush (xfs_dqflock() and friends)
+ *         qi->qi_lru_lock
  *
  * If two dquots need to be locked the order is user before group/project,
  * otherwise by the lowest id first, see xfs_dqlock2.
@@ -60,6 +59,9 @@ int xfs_dqreq_num;
 int xfs_dqerror_mod = 33;
 #endif
 
+struct kmem_zone		*xfs_qm_dqtrxzone;
+static struct kmem_zone	*xfs_qm_dqzone;
+
 static struct lock_class_key xfs_dquot_other_class;
 
 /*
@@ -69,12 +71,12 @@ void
 xfs_qm_dqdestroy(
 	xfs_dquot_t	*dqp)
 {
-	ASSERT(list_empty(&dqp->q_freelist));
+	ASSERT(list_empty(&dqp->q_lru));
 
 	mutex_destroy(&dqp->q_qlock);
-	kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
+	kmem_zone_free(xfs_qm_dqzone, dqp);
 
-	atomic_dec(&xfs_Gqm->qm_totaldquots);
+	XFS_STATS_DEC(xs_qm_dquot);
 }
 
 /*
@@ -282,7 +284,7 @@ xfs_qm_dqalloc(
 	 * Return if this type of quotas is turned off while we didn't
 	 * have an inode lock
 	 */
-	if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+	if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
 		xfs_iunlock(quotip, XFS_ILOCK_EXCL);
 		return (ESRCH);
 	}
@@ -384,7 +386,7 @@ xfs_qm_dqtobp(
 	dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
 
 	xfs_ilock(quotip, XFS_ILOCK_SHARED);
-	if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+	if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
 		/*
 		 * Return if this type of quotas is turned off while we
 		 * didn't have the quota inode lock.
@@ -492,12 +494,12 @@ xfs_qm_dqread(
 	int			cancelflags = 0;
 
 
-	dqp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
+	dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
 
 	dqp->dq_flags = type;
 	dqp->q_core.d_id = cpu_to_be32(id);
 	dqp->q_mount = mp;
-	INIT_LIST_HEAD(&dqp->q_freelist);
+	INIT_LIST_HEAD(&dqp->q_lru);
 	mutex_init(&dqp->q_qlock);
 	init_waitqueue_head(&dqp->q_pinwait);
 
@@ -516,7 +518,7 @@
 	if (!(type & XFS_DQ_USER))
 		lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
 
-	atomic_inc(&xfs_Gqm->qm_totaldquots);
+	XFS_STATS_INC(xs_qm_dquot);
 
 	trace_xfs_dqread(dqp);
 
@@ -602,60 +604,6 @@ error0:
 }
 
 /*
- * Lookup a dquot in the incore dquot hashtable. We keep two separate
- * hashtables for user and group dquots; and, these are global tables
- * inside the XQM, not per-filesystem tables.
- * The hash chain must be locked by caller, and it is left locked
- * on return. Returning dquot is locked.
- */
-STATIC int
-xfs_qm_dqlookup(
-	xfs_mount_t		*mp,
-	xfs_dqid_t		id,
-	xfs_dqhash_t		*qh,
-	xfs_dquot_t		**O_dqpp)
-{
-	xfs_dquot_t		*dqp;
-
-	ASSERT(mutex_is_locked(&qh->qh_lock));
-
-	/*
-	 * Traverse the hashchain looking for a match
-	 */
-	list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
-		/*
-		 * We already have the hashlock. We don't need the
-		 * dqlock to look at the id field of the dquot, since the
-		 * id can't be modified without the hashlock anyway.
-		 */
-		if (be32_to_cpu(dqp->q_core.d_id) != id || dqp->q_mount != mp)
-			continue;
-
-		trace_xfs_dqlookup_found(dqp);
-
-		xfs_dqlock(dqp);
-		if (dqp->dq_flags & XFS_DQ_FREEING) {
-			*O_dqpp = NULL;
-			xfs_dqunlock(dqp);
-			return -1;
-		}
-
-		dqp->q_nrefs++;
-
-		/*
-		 * move the dquot to the front of the hashchain
-		 */
-		list_move(&dqp->q_hashlist, &qh->qh_list);
-		trace_xfs_dqlookup_done(dqp);
-		*O_dqpp = dqp;
-		return 0;
-	}
-
-	*O_dqpp = NULL;
-	return 1;
-}
-
-/*
  * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
  * a locked dquot, doing an allocation (if requested) as needed.
  * When both an inode and an id are given, the inode's id takes precedence.
@@ -672,10 +620,10 @@ xfs_qm_dqget(
 	uint		flags,	  /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
 	xfs_dquot_t	**O_dqpp) /* OUT : locked incore dquot */
 {
-	xfs_dquot_t	*dqp;
-	xfs_dqhash_t	*h;
-	uint		version;
-	int		error;
+	struct xfs_quotainfo	*qi = mp->m_quotainfo;
+	struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+	struct xfs_dquot	*dqp;
+	int			error;
 
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 	if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
@@ -683,7 +631,6 @@ xfs_qm_dqget(
 	    (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
 		return (ESRCH);
 	}
-	h = XFS_DQ_HASH(mp, id, type);
 
 #ifdef DEBUG
 	if (xfs_do_dqerror) {
@@ -699,42 +646,33 @@ xfs_qm_dqget(
 		type == XFS_DQ_GROUP);
 	if (ip) {
 		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-		if (type == XFS_DQ_USER)
-			ASSERT(ip->i_udquot == NULL);
-		else
-			ASSERT(ip->i_gdquot == NULL);
+		ASSERT(xfs_inode_dquot(ip, type) == NULL);
 	}
 #endif
 
 restart:
-	mutex_lock(&h->qh_lock);
+	mutex_lock(&qi->qi_tree_lock);
+	dqp = radix_tree_lookup(tree, id);
+	if (dqp) {
+		xfs_dqlock(dqp);
+		if (dqp->dq_flags & XFS_DQ_FREEING) {
+			xfs_dqunlock(dqp);
+			mutex_unlock(&qi->qi_tree_lock);
+			trace_xfs_dqget_freeing(dqp);
+			delay(1);
+			goto restart;
+		}
 
-	/*
-	 * Look in the cache (hashtable).
-	 * The chain is kept locked during lookup.
-	 */
-	switch (xfs_qm_dqlookup(mp, id, h, O_dqpp)) {
-	case -1:
-		XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
-		mutex_unlock(&h->qh_lock);
-		delay(1);
-		goto restart;
-	case 0:
-		XQM_STATS_INC(xqmstats.xs_qm_dqcachehits);
-		/*
-		 * The dquot was found, moved to the front of the chain,
-		 * taken off the freelist if it was on it, and locked
-		 * at this point. Just unlock the hashchain and return.
-		 */
-		ASSERT(*O_dqpp);
-		ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
-		mutex_unlock(&h->qh_lock);
-		trace_xfs_dqget_hit(*O_dqpp);
-		return 0;	/* success */
-	default:
-		XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
-		break;
+		dqp->q_nrefs++;
+		mutex_unlock(&qi->qi_tree_lock);
+
+		trace_xfs_dqget_hit(dqp);
+		XFS_STATS_INC(xs_qm_dqcachehits);
+		*O_dqpp = dqp;
+		return 0;
 	}
+	mutex_unlock(&qi->qi_tree_lock);
+	XFS_STATS_INC(xs_qm_dqcachemisses);
 
 	/*
 	 * Dquot cache miss. We don't want to keep the inode lock across
@@ -745,12 +683,6 @@ restart:
 	 */
 	if (ip)
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	/*
-	 * Save the hashchain version stamp, and unlock the chain, so that
-	 * we don't keep the lock across a disk read
-	 */
-	version = h->qh_version;
-	mutex_unlock(&h->qh_lock);
 
 	error = xfs_qm_dqread(mp, id, type, flags, &dqp);
 
@@ -760,97 +692,53 @@ restart:
 	if (error)
 		return error;
 
-	/*
-	 * Dquot lock comes after hashlock in the lock ordering
-	 */
 	if (ip) {
 		/*
 		 * A dquot could be attached to this inode by now, since
 		 * we had dropped the ilock.
 		 */
-		if (type == XFS_DQ_USER) {
-			if (!XFS_IS_UQUOTA_ON(mp)) {
-				/* inode stays locked on return */
-				xfs_qm_dqdestroy(dqp);
-				return XFS_ERROR(ESRCH);
-			}
-			if (ip->i_udquot) {
+		if (xfs_this_quota_on(mp, type)) {
+			struct xfs_dquot	*dqp1;
+
+			dqp1 = xfs_inode_dquot(ip, type);
+			if (dqp1) {
 				xfs_qm_dqdestroy(dqp);
-				dqp = ip->i_udquot;
+				dqp = dqp1;
 				xfs_dqlock(dqp);
 				goto dqret;
 			}
 		} else {
-			if (!XFS_IS_OQUOTA_ON(mp)) {
-				/* inode stays locked on return */
-				xfs_qm_dqdestroy(dqp);
-				return XFS_ERROR(ESRCH);
-			}
-			if (ip->i_gdquot) {
-				xfs_qm_dqdestroy(dqp);
-				dqp = ip->i_gdquot;
-				xfs_dqlock(dqp);
-				goto dqret;
-			}
+			/* inode stays locked on return */
+			xfs_qm_dqdestroy(dqp);
+			return XFS_ERROR(ESRCH);
 		}
 	}
 
-	/*
-	 * Hashlock comes after ilock in lock order
-	 */
-	mutex_lock(&h->qh_lock);
-	if (version != h->qh_version) {
-		xfs_dquot_t *tmpdqp;
+	mutex_lock(&qi->qi_tree_lock);
+	error = -radix_tree_insert(tree, id, dqp);
+	if (unlikely(error)) {
+		WARN_ON(error != EEXIST);
+
 		/*
-		 * Now, see if somebody else put the dquot in the
-		 * hashtable before us. This can happen because we didn't
-		 * keep the hashchain lock. We don't have to worry about
-		 * lock order between the two dquots here since dqp isn't
-		 * on any findable lists yet.
+		 * Duplicate found. Just throw away the new dquot and start
+		 * over.
 		 */
-		switch (xfs_qm_dqlookup(mp, id, h, &tmpdqp)) {
-		case 0:
-		case -1:
-			/*
-			 * Duplicate found, either in cache or on its way out.
-			 * Just throw away the new dquot and start over.
-			 */
-			if (tmpdqp)
-				xfs_qm_dqput(tmpdqp);
-			mutex_unlock(&h->qh_lock);
-			xfs_qm_dqdestroy(dqp);
-			XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
-			goto restart;
-		default:
-			break;
-		}
+		mutex_unlock(&qi->qi_tree_lock);
+		trace_xfs_dqget_dup(dqp);
+		xfs_qm_dqdestroy(dqp);
+		XFS_STATS_INC(xs_qm_dquot_dups);
+		goto restart;
 	}
 
 	/*
-	 * Put the dquot at the beginning of the hash-chain and mp's list
-	 * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
-	 */
-	ASSERT(mutex_is_locked(&h->qh_lock));
-	dqp->q_hash = h;
-	list_add(&dqp->q_hashlist, &h->qh_list);
-	h->qh_version++;
-
-	/*
-	 * Attach this dquot to this filesystem's list of all dquots,
-	 * kept inside the mount structure in m_quotainfo field
-	 */
-	mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
-
-	/*
 	 * We return a locked dquot to the caller, with a reference taken
 	 */
 	xfs_dqlock(dqp);
 	dqp->q_nrefs = 1;
 
-	list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
-	mp->m_quotainfo->qi_dquots++;
-	mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
-	mutex_unlock(&h->qh_lock);
+	qi->qi_dquots++;
+	mutex_unlock(&qi->qi_tree_lock);
+
 dqret:
 	ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	trace_xfs_dqget_miss(dqp);
@@ -859,37 +747,22 @@ restart:
 }
 
 
-/*
- * Release a reference to the dquot (decrement ref-count)
- * and unlock it. If there is a group quota attached to this
- * dquot, carefully release that too without tripping over
- * deadlocks'n'stuff.
- */
-void
-xfs_qm_dqput(
+STATIC void
+xfs_qm_dqput_final(
 	struct xfs_dquot	*dqp)
 {
+	struct xfs_quotainfo	*qi = dqp->q_mount->m_quotainfo;
 	struct xfs_dquot	*gdqp;
 
-	ASSERT(dqp->q_nrefs > 0);
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-
-	trace_xfs_dqput(dqp);
-
-recurse:
-	if (--dqp->q_nrefs > 0) {
-		xfs_dqunlock(dqp);
-		return;
-	}
-
 	trace_xfs_dqput_free(dqp);
 
-	mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
-	if (list_empty(&dqp->q_freelist)) {
-		list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
-		xfs_Gqm->qm_dqfrlist_cnt++;
+	mutex_lock(&qi->qi_lru_lock);
+	if (list_empty(&dqp->q_lru)) {
+		list_add_tail(&dqp->q_lru, &qi->qi_lru_list);
+		qi->qi_lru_count++;
+		XFS_STATS_INC(xs_qm_dquot_unused);
 	}
-	mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+	mutex_unlock(&qi->qi_lru_lock);
 
 	/*
 	 * If we just added a udquot to the freelist, then we want to release
@@ -906,10 +779,29 @@ recurse:
 	/*
 	 * If we had a group quota hint, release it now.
 	 */
-	if (gdqp) {
-		dqp = gdqp;
-		goto recurse;
-	}
+	if (gdqp)
+		xfs_qm_dqput(gdqp);
+}
+
+/*
+ * Release a reference to the dquot (decrement ref-count) and unlock it.
+ *
+ * If there is a group quota attached to this dquot, carefully release that
+ * too without tripping over deadlocks'n'stuff.
+ */
+void
+xfs_qm_dqput(
+	struct xfs_dquot	*dqp)
+{
+	ASSERT(dqp->q_nrefs > 0);
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+	trace_xfs_dqput(dqp);
+
+	if (--dqp->q_nrefs > 0)
+		xfs_dqunlock(dqp);
+	else
+		xfs_qm_dqput_final(dqp);
 }
 
 /*
@@ -1091,17 +983,6 @@ xfs_qm_dqflush(
 
 }
 
-void
-xfs_dqunlock(
-	xfs_dquot_t *dqp)
-{
-	xfs_dqunlock_nonotify(dqp);
-	if (dqp->q_logitem.qli_dquot == dqp) {
-		xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp,
-					&dqp->q_logitem.qli_item);
-	}
-}
-
 /*
  * Lock two xfs_dquot structures.
  *
@@ -1131,85 +1012,6 @@ xfs_dqlock2(
 }
 
 /*
- * Take a dquot out of the mount's dqlist as well as the hashlist. This is
- * called via unmount as well as quotaoff, and the purge will always succeed.
- */
-void
-xfs_qm_dqpurge(
-	struct xfs_dquot	*dqp)
-{
-	struct xfs_mount	*mp = dqp->q_mount;
-	struct xfs_dqhash	*qh = dqp->q_hash;
-
-	xfs_dqlock(dqp);
-
-	/*
-	 * If we're turning off quotas, we have to make sure that, for
-	 * example, we don't delete quota disk blocks while dquots are
-	 * in the process of getting written to those disk blocks.
-	 * This dquot might well be on AIL, and we can't leave it there
-	 * if we're turning off quotas. Basically, we need this flush
-	 * lock, and are willing to block on it.
-	 */
-	if (!xfs_dqflock_nowait(dqp)) {
-		/*
-		 * Block on the flush lock after nudging dquot buffer,
-		 * if it is incore.
-		 */
-		xfs_dqflock_pushbuf_wait(dqp);
-	}
-
-	/*
-	 * If we are turning this type of quotas off, we don't care
-	 * about the dirty metadata sitting in this dquot. OTOH, if
-	 * we're unmounting, we do care, so we flush it and wait.
-	 */
-	if (XFS_DQ_IS_DIRTY(dqp)) {
-		int	error;
-
-		/*
-		 * We don't care about getting disk errors here. We need
-		 * to purge this dquot anyway, so we go ahead regardless.
-		 */
-		error = xfs_qm_dqflush(dqp, SYNC_WAIT);
-		if (error)
-			xfs_warn(mp, "%s: dquot %p flush failed",
-				__func__, dqp);
-		xfs_dqflock(dqp);
-	}
-
-	ASSERT(atomic_read(&dqp->q_pincount) == 0);
-	ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
-	       !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
-
-	xfs_dqfunlock(dqp);
-	xfs_dqunlock(dqp);
-
-	mutex_lock(&qh->qh_lock);
-	list_del_init(&dqp->q_hashlist);
-	qh->qh_version++;
-	mutex_unlock(&qh->qh_lock);
-
-	mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
-	list_del_init(&dqp->q_mplist);
-	mp->m_quotainfo->qi_dqreclaims++;
-	mp->m_quotainfo->qi_dquots--;
-	mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
-
-	/*
-	 * We move dquots to the freelist as soon as their reference count
-	 * hits zero, so it really should be on the freelist here.
-	 */
-	mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
-	ASSERT(!list_empty(&dqp->q_freelist));
-	list_del_init(&dqp->q_freelist);
-	xfs_Gqm->qm_dqfrlist_cnt--;
-	mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-
-	xfs_qm_dqdestroy(dqp);
-}
-
-/*
  * Give the buffer a little push if it is incore and
  * wait on the flush lock.
  */
@@ -1241,3 +1043,31 @@ xfs_dqflock_pushbuf_wait(
 out_lock:
 	xfs_dqflock(dqp);
 }
+
+int __init
+xfs_qm_init(void)
+{
+	xfs_qm_dqzone =
+		kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot");
+	if (!xfs_qm_dqzone)
+		goto out;
+
+	xfs_qm_dqtrxzone =
+		kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx");
+	if (!xfs_qm_dqtrxzone)
+		goto out_free_dqzone;
+
+	return 0;
+
+out_free_dqzone:
+	kmem_zone_destroy(xfs_qm_dqzone);
+out:
+	return -ENOMEM;
+}
+
+void
+xfs_qm_exit(void)
+{
+	kmem_zone_destroy(xfs_qm_dqtrxzone);
+	kmem_zone_destroy(xfs_qm_dqzone);
+}
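The restructured xfs_qm_dqget() above is a classic lookup-or-insert-with-retry pattern against a per-mount tree. A user-space approximation of just that control flow, with a toy fixed-size table and a pthread mutex standing in for the radix tree and qi_tree_lock (all names and types here are invented for illustration):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NIDS 128

struct dquot {
	unsigned int	id;
	int		nrefs;
	int		freeing;	/* models XFS_DQ_FREEING */
};

static struct dquot *tree[NIDS];
static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

static struct dquot *dqget(unsigned int id)
{
restart:
	pthread_mutex_lock(&tree_lock);
	if (tree[id]) {
		struct dquot *dqp = tree[id];

		if (dqp->freeing) {
			/* racing with a free: back off and retry */
			pthread_mutex_unlock(&tree_lock);
			goto restart;
		}
		dqp->nrefs++;			/* cache hit */
		pthread_mutex_unlock(&tree_lock);
		return dqp;
	}
	pthread_mutex_unlock(&tree_lock);

	/* cache miss: "read" a new dquot without holding the lock */
	struct dquot *dqp = calloc(1, sizeof(*dqp));
	dqp->id = id;

	pthread_mutex_lock(&tree_lock);
	if (tree[id]) {
		/* somebody beat us to the insert: throw ours away */
		pthread_mutex_unlock(&tree_lock);
		free(dqp);
		goto restart;
	}
	tree[id] = dqp;
	dqp->nrefs = 1;
	pthread_mutex_unlock(&tree_lock);
	return dqp;
}

int main(void)
{
	struct dquot *d = dqget(7);

	printf("id=%u nrefs=%d\n", d->id, d->nrefs);
	return 0;
}

The key property, mirrored from the patch, is that the expensive read happens outside the lock, so a duplicate insert must be detected and discarded rather than prevented.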
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index a1d91d8f1802..ef9190bd8b30 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -29,16 +29,6 @@
  * when quotas are off.
  */
 
-/*
- * The hash chain headers (hash buckets)
- */
-typedef struct xfs_dqhash {
-	struct list_head  qh_list;
-	struct mutex	  qh_lock;
-	uint		  qh_version;	/* ever increasing version */
-	uint		  qh_nelems;	/* number of dquots on the list */
-} xfs_dqhash_t;
-
 struct xfs_mount;
 struct xfs_trans;
 
@@ -47,10 +37,7 @@ struct xfs_trans;
  */
 typedef struct xfs_dquot {
 	uint		 dq_flags;	/* various flags (XFS_DQ_*) */
-	struct list_head q_freelist;	/* global free list of dquots */
-	struct list_head q_mplist;	/* mount's list of dquots */
-	struct list_head q_hashlist;	/* gloabl hash list of dquots */
-	xfs_dqhash_t	*q_hash;	/* the hashchain header */
+	struct list_head q_lru;		/* global free list of dquots */
 	struct xfs_mount*q_mount;	/* filesystem this relates to */
 	struct xfs_trans*q_transp;	/* trans this belongs to currently */
 	uint		 q_nrefs;	/* # active refs from inodes */
@@ -110,11 +97,37 @@ static inline void xfs_dqlock(struct xfs_dquot *dqp)
 	mutex_lock(&dqp->q_qlock);
 }
 
-static inline void xfs_dqunlock_nonotify(struct xfs_dquot *dqp)
+static inline void xfs_dqunlock(struct xfs_dquot *dqp)
 {
 	mutex_unlock(&dqp->q_qlock);
 }
 
+static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
+{
+	switch (type & XFS_DQ_ALLTYPES) {
+	case XFS_DQ_USER:
+		return XFS_IS_UQUOTA_ON(mp);
+	case XFS_DQ_GROUP:
+	case XFS_DQ_PROJ:
+		return XFS_IS_OQUOTA_ON(mp);
+	default:
+		return 0;
+	}
+}
+
+static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
+{
+	switch (type & XFS_DQ_ALLTYPES) {
+	case XFS_DQ_USER:
+		return ip->i_udquot;
+	case XFS_DQ_GROUP:
+	case XFS_DQ_PROJ:
+		return ip->i_gdquot;
+	default:
+		return NULL;
+	}
+}
+
 #define XFS_DQ_IS_LOCKED(dqp)	(mutex_is_locked(&((dqp)->q_qlock)))
 #define XFS_DQ_IS_DIRTY(dqp)	((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
@@ -125,15 +138,10 @@ static inline void xfs_dqunlock_nonotify(struct xfs_dquot *dqp)
 		XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
 		XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
 
-#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \
-				     (XFS_IS_UQUOTA_ON((d)->q_mount)) : \
-				     (XFS_IS_OQUOTA_ON((d)->q_mount))))
-
 extern int		xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
 					uint, struct xfs_dquot **);
 extern void		xfs_qm_dqdestroy(xfs_dquot_t *);
 extern int		xfs_qm_dqflush(xfs_dquot_t *, uint);
-extern void		xfs_qm_dqpurge(xfs_dquot_t *);
 extern void		xfs_qm_dqunpin_wait(xfs_dquot_t *);
 extern void		xfs_qm_adjust_dqtimers(xfs_mount_t *,
 					xfs_disk_dquot_t *);
@@ -144,7 +152,6 @@ extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
 extern void		xfs_qm_dqput(xfs_dquot_t *);
 
 extern void		xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
-extern void		xfs_dqunlock(struct xfs_dquot *);
 extern void		xfs_dqflock_pushbuf_wait(struct xfs_dquot *dqp);
 
 static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
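The xfs_qm_dqput()/xfs_qm_dqput_final() split in xfs_dquot.c keeps the common reference-drop path short and pushes the freelist handling into a helper, which also lets the group-quota release become a plain call instead of a goto loop. A simplified stand-alone model of that control flow (the types and fields are invented for illustration):

#include <assert.h>
#include <stdio.h>

struct dquot {
	int nrefs;
	int on_lru;	/* stands in for list_empty(&dqp->q_lru) */
};

static void dqput_final(struct dquot *dqp)
{
	/* last reference gone: park the dquot on the reclaimable LRU */
	if (!dqp->on_lru)
		dqp->on_lru = 1;
}

static void dqput(struct dquot *dqp)
{
	assert(dqp->nrefs > 0);
	if (--dqp->nrefs > 0)
		return;		/* fast path: still referenced */
	dqput_final(dqp);	/* slow path in its own helper */
}

int main(void)
{
	struct dquot dq = { .nrefs = 2 };

	dqput(&dq);
	dqput(&dq);
	printf("nrefs=%d on_lru=%d\n", dq.nrefs, dq.on_lru);
	return 0;
}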
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7e5bc872f2b4..54a67dd9ac0a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -163,7 +163,6 @@ xfs_file_fsync(
 	struct inode		*inode = file->f_mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_trans	*tp;
 	int			error = 0;
 	int			log_flushed = 0;
 	xfs_lsn_t		lsn = 0;
@@ -194,75 +193,18 @@ xfs_file_fsync(
 	}
 
 	/*
-	 * We always need to make sure that the required inode state is safe on
-	 * disk. The inode might be clean but we still might need to force the
-	 * log because of committed transactions that haven't hit the disk yet.
-	 * Likewise, there could be unflushed non-transactional changes to the
-	 * inode core that have to go to disk and this requires us to issue
-	 * a synchronous transaction to capture these changes correctly.
-	 *
-	 * This code relies on the assumption that if the i_update_core field
-	 * of the inode is clear and the inode is unpinned then it is clean
-	 * and no action is required.
+	 * All metadata updates are logged, which means that we just have
+	 * to flush the log up to the latest LSN that touched the inode.
 	 */
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
-
-	/*
-	 * First check if the VFS inode is marked dirty. All the dirtying
-	 * of non-transactional updates do not go through mark_inode_dirty*,
-	 * which allows us to distinguish between pure timestamp updates
-	 * and i_size updates which need to be caught for fdatasync.
-	 * After that also check for the dirty state in the XFS inode, which
-	 * might gets cleared when the inode gets written out via the AIL
-	 * or xfs_iflush_cluster.
-	 */
-	if (((inode->i_state & I_DIRTY_DATASYNC) ||
-	    ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
-	    ip->i_update_core) {
-		/*
-		 * Kick off a transaction to log the inode core to get the
-		 * updates. The sync transaction will also force the log.
-		 */
-		xfs_iunlock(ip, XFS_ILOCK_SHARED);
-		tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-		error = xfs_trans_reserve(tp, 0,
-				XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
-		if (error) {
-			xfs_trans_cancel(tp, 0);
-			return -error;
-		}
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-		/*
-		 * Note - it's possible that we might have pushed ourselves out
-		 * of the way during trans_reserve which would flush the inode.
-		 * But there's no guarantee that the inode buffer has actually
-		 * gone out yet (it's delwri). Plus the buffer could be pinned
-		 * anyway if it's part of an inode in another recent
-		 * transaction. So we play it safe and fire off the
-		 * transaction anyway.
-		 */
-		xfs_trans_ijoin(tp, ip, 0);
-		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-		error = xfs_trans_commit(tp, 0);
-
-		lsn = ip->i_itemp->ili_last_lsn;
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	} else {
-		/*
-		 * Timestamps/size haven't changed since last inode flush or
-		 * inode transaction commit. That means either nothing got
-		 * written or a transaction committed which caught the updates.
-		 * If the latter happened and the transaction hasn't hit the
-		 * disk yet, the inode will be still be pinned. If it is,
-		 * force the log.
-		 */
-		if (xfs_ipincount(ip))
+	if (xfs_ipincount(ip)) {
+		if (!datasync ||
+		    (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
 			lsn = ip->i_itemp->ili_last_lsn;
-		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	}
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
-	if (!error && lsn)
+	if (lsn)
 		error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
 
 	/*
@@ -659,9 +601,6 @@ restart:
 		return error;
 	}
 
-	if (likely(!(file->f_mode & FMODE_NOCMTIME)))
-		file_update_time(file);
-
 	/*
 	 * If the offset is beyond the size of the file, we need to zero any
 	 * blocks that fall between the existing EOF and the start of this
@@ -685,6 +624,15 @@ restart:
 		return error;
 
 	/*
+	 * Updating the timestamps will grab the ilock again from
+	 * xfs_fs_dirty_inode, so we have to call it after dropping the
+	 * lock above. Eventually we should look into a way to avoid
+	 * the pointless lock roundtrip.
+	 */
+	if (likely(!(file->f_mode & FMODE_NOCMTIME)))
+		file_update_time(file);
+
+	/*
 	 * If we're writing the file then make sure to clear the setuid and
 	 * setgid bits if the process is not being run by root. This keeps
 	 * people from modifying setuid and setgid binaries.
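With every metadata update logged, the fsync path above reduces to: if the inode is still pinned by a transaction that has not reached disk, force the log up to the inode's last LSN, unless the only dirty state is a timestamp and the caller asked for fdatasync. A rough stand-alone model of that decision (all structures, the flag value, and the log_force callback are stand-ins, not the kernel API):

#include <stdio.h>
#include <stdint.h>

#define ILOG_TIMESTAMP 0x4000	/* illustrative flag value */

struct inode_item {
	uint64_t	last_lsn;	/* LSN of the last transaction */
	unsigned int	fields;		/* dirty-field mask */
};

struct inode {
	int			pincount;
	struct inode_item	item;
};

static int log_force_lsn(uint64_t lsn)
{
	printf("forcing log to lsn %llu\n", (unsigned long long)lsn);
	return 0;
}

static int fsync_inode(struct inode *ip, int datasync)
{
	uint64_t lsn = 0;

	if (ip->pincount) {
		/* a pure timestamp update may be skipped for fdatasync */
		if (!datasync || (ip->item.fields & ~ILOG_TIMESTAMP))
			lsn = ip->item.last_lsn;
	}
	return lsn ? log_force_lsn(lsn) : 0;
}

int main(void)
{
	struct inode ip = { .pincount = 1,
			    .item = { .last_lsn = 42, .fields = 0x1 } };

	return fsync_inode(&ip, 1);
}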
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 8c3e46394d48..bcc6c249b2c7 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -91,7 +91,6 @@ xfs_inode_alloc(
 	ip->i_afp = NULL;
 	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
 	ip->i_flags = 0;
-	ip->i_update_core = 0;
 	ip->i_delayed_blks = 0;
 	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
 
@@ -290,7 +289,7 @@ xfs_iget_cache_hit(
 	if (lock_flags != 0)
 		xfs_ilock(ip, lock_flags);
 
-	xfs_iflags_clear(ip, XFS_ISTALE);
+	xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
 	XFS_STATS_INC(xs_ig_found);
 
 	return 0;
@@ -315,6 +314,7 @@ xfs_iget_cache_miss(
 	struct xfs_inode	*ip;
 	int			error;
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
+	int			iflags;
 
 	ip = xfs_inode_alloc(mp, ino);
 	if (!ip)
@@ -350,9 +350,23 @@
 		BUG();
 	}
 
-	spin_lock(&pag->pag_ici_lock);
+	/*
+	 * These values must be set before inserting the inode into the radix
+	 * tree as the moment it is inserted a concurrent lookup (allowed by the
+	 * RCU locking mechanism) can find it and that lookup must see that this
+	 * is an inode currently under construction (i.e. that XFS_INEW is set).
+	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
+	 * memory barrier that ensures this detection works correctly at lookup
+	 * time.
+	 */
+	iflags = XFS_INEW;
+	if (flags & XFS_IGET_DONTCACHE)
+		iflags |= XFS_IDONTCACHE;
+	ip->i_udquot = ip->i_gdquot = NULL;
+	xfs_iflags_set(ip, iflags);
 
 	/* insert the new inode */
+	spin_lock(&pag->pag_ici_lock);
 	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
 	if (unlikely(error)) {
 		WARN_ON(error != -EEXIST);
@@ -360,11 +374,6 @@
 		error = EAGAIN;
 		goto out_preload_end;
 	}
-
-	/* These values _must_ be set before releasing the radix tree lock! */
-	ip->i_udquot = ip->i_gdquot = NULL;
-	xfs_iflags_set(ip, XFS_INEW);
-
 	spin_unlock(&pag->pag_ici_lock);
 	radix_tree_preload_end();
 
@@ -418,6 +427,15 @@ xfs_iget(
 	xfs_perag_t	*pag;
 	xfs_agino_t	agino;
 
+	/*
+	 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
+	 * doesn't get freed while it's being referenced during a
+	 * radix tree traversal here. It assumes this function
+	 * aqcuires only the ILOCK (and therefore it has no need to
+	 * involve the IOLOCK in this synchronization).
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
+
 	/* reject inode numbers outside existing AGs */
 	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
 		return EINVAL;
@@ -642,8 +660,7 @@ xfs_iunlock(
 	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY |
-			XFS_LOCK_DEP_MASK)) == 0);
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
 	ASSERT(lock_flags != 0);
 
 	if (lock_flags & XFS_IOLOCK_EXCL)
@@ -656,16 +673,6 @@ xfs_iunlock(
 	else if (lock_flags & XFS_ILOCK_SHARED)
 		mrunlock_shared(&ip->i_lock);
 
-	if ((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) &&
-	    !(lock_flags & XFS_IUNLOCK_NONOTIFY) && ip->i_itemp) {
-		/*
-		 * Let the AIL know that this item has been unlocked in case
-		 * it is in the AIL and anyone is waiting on it. Don't do
-		 * this if the caller has asked us not to.
-		 */
-		xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp,
-					(xfs_log_item_t*)(ip->i_itemp));
-	}
 	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
 }
 
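The comment added to xfs_iget_cache_miss() states a publish-after-initialise rule: every field a concurrent (RCU-protected) lookup may read, including the XFS_INEW flag, must be set before the radix tree insert makes the inode findable. A deliberately single-threaded sketch of that ordering; the table, flag values, and helper are all hypothetical stand-ins, and the real memory-barrier guarantee comes from the kernel's i_flags_lock, which is not modelled here:

#include <stdio.h>

#define INEW      0x1
#define DONTCACHE 0x2

struct inode {
	unsigned int	flags;
	void		*udquot, *gdquot;
};

static struct inode *table[16];	/* toy stand-in for the radix tree */

static int insert_inode(struct inode *ip, unsigned int slot, int dontcache)
{
	unsigned int iflags = INEW;

	if (dontcache)
		iflags |= DONTCACHE;

	/*
	 * Set every field a concurrent lookup could inspect *before*
	 * publishing the inode; once table[slot] is assigned, other
	 * threads may already see it.
	 */
	ip->udquot = ip->gdquot = NULL;
	ip->flags |= iflags;

	if (table[slot])
		return -1;	/* models radix_tree_insert() -EEXIST */
	table[slot] = ip;
	return 0;
}

int main(void)
{
	struct inode ino = { 0 };

	if (insert_inode(&ino, 3, 1) == 0)
		printf("inserted with flags 0x%x\n", ino.flags);
	return 0;
}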
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b21022499c2e..bc46c0a133d3 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1656,14 +1656,13 @@ retry:
 		iip = ip->i_itemp;
 		if (!iip || xfs_inode_clean(ip)) {
 			ASSERT(ip != free_ip);
-			ip->i_update_core = 0;
 			xfs_ifunlock(ip);
 			xfs_iunlock(ip, XFS_ILOCK_EXCL);
 			continue;
 		}
 
-		iip->ili_last_fields = iip->ili_format.ilf_fields;
-		iip->ili_format.ilf_fields = 0;
+		iip->ili_last_fields = iip->ili_fields;
+		iip->ili_fields = 0;
 		iip->ili_logged = 1;
 		xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
 					&iip->ili_item.li_lsn);
@@ -2177,7 +2176,7 @@ xfs_iflush_fork(
 	mp = ip->i_mount;
 	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
 	case XFS_DINODE_FMT_LOCAL:
-		if ((iip->ili_format.ilf_fields & dataflag[whichfork]) &&
+		if ((iip->ili_fields & dataflag[whichfork]) &&
 		    (ifp->if_bytes > 0)) {
 			ASSERT(ifp->if_u1.if_data != NULL);
 			ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
@@ -2187,8 +2186,8 @@
 
 	case XFS_DINODE_FMT_EXTENTS:
 		ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
-		       !(iip->ili_format.ilf_fields & extflag[whichfork]));
-		if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
+		       !(iip->ili_fields & extflag[whichfork]));
+		if ((iip->ili_fields & extflag[whichfork]) &&
 		    (ifp->if_bytes > 0)) {
 			ASSERT(xfs_iext_get_ext(ifp, 0));
 			ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
@@ -2198,7 +2197,7 @@
 		break;
 
 	case XFS_DINODE_FMT_BTREE:
-		if ((iip->ili_format.ilf_fields & brootflag[whichfork]) &&
+		if ((iip->ili_fields & brootflag[whichfork]) &&
 		    (ifp->if_broot_bytes > 0)) {
 			ASSERT(ifp->if_broot != NULL);
 			ASSERT(ifp->if_broot_bytes <=
@@ -2211,14 +2210,14 @@
 		break;
 
 	case XFS_DINODE_FMT_DEV:
-		if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
+		if (iip->ili_fields & XFS_ILOG_DEV) {
 			ASSERT(whichfork == XFS_DATA_FORK);
 			xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
 		}
 		break;
 
 	case XFS_DINODE_FMT_UUID:
-		if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
+		if (iip->ili_fields & XFS_ILOG_UUID) {
 			ASSERT(whichfork == XFS_DATA_FORK);
 			memcpy(XFS_DFORK_DPTR(dip),
 			       &ip->i_df.if_u2.if_uuid,
@@ -2451,9 +2450,8 @@ xfs_iflush(
 	 * to disk, because the log record didn't make it to disk!
 	 */
 	if (XFS_FORCED_SHUTDOWN(mp)) {
-		ip->i_update_core = 0;
 		if (iip)
-			iip->ili_format.ilf_fields = 0;
+			iip->ili_fields = 0;
 		xfs_ifunlock(ip);
 		return XFS_ERROR(EIO);
 	}
@@ -2533,26 +2531,6 @@ xfs_iflush_int(
 	/* set *dip = inode's place in the buffer */
 	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
 
-	/*
-	 * Clear i_update_core before copying out the data.
-	 * This is for coordination with our timestamp updates
-	 * that don't hold the inode lock. They will always
-	 * update the timestamps BEFORE setting i_update_core,
-	 * so if we clear i_update_core after they set it we
-	 * are guaranteed to see their updates to the timestamps.
-	 * I believe that this depends on strongly ordered memory
-	 * semantics, but we have that. We use the SYNCHRONIZE
-	 * macro to make sure that the compiler does not reorder
-	 * the i_update_core access below the data copy below.
-	 */
-	ip->i_update_core = 0;
-	SYNCHRONIZE();
-
-	/*
-	 * Make sure to get the latest timestamps from the Linux inode.
-	 */
-	xfs_synchronize_times(ip);
-
 	if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
 			       mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
 		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
@@ -2663,36 +2641,33 @@ xfs_iflush_int(
 	xfs_inobp_check(mp, bp);
 
 	/*
-	 * We've recorded everything logged in the inode, so we'd
-	 * like to clear the ilf_fields bits so we don't log and
-	 * flush things unnecessarily. However, we can't stop
-	 * logging all this information until the data we've copied
-	 * into the disk buffer is written to disk. If we did we might
-	 * overwrite the copy of the inode in the log with all the
-	 * data after re-logging only part of it, and in the face of
-	 * a crash we wouldn't have all the data we need to recover.
+	 * We've recorded everything logged in the inode, so we'd like to clear
+	 * the ili_fields bits so we don't log and flush things unnecessarily.
+	 * However, we can't stop logging all this information until the data
+	 * we've copied into the disk buffer is written to disk. If we did we
+	 * might overwrite the copy of the inode in the log with all the data
+	 * after re-logging only part of it, and in the face of a crash we
+	 * wouldn't have all the data we need to recover.
 	 *
-	 * What we do is move the bits to the ili_last_fields field.
-	 * When logging the inode, these bits are moved back to the
-	 * ilf_fields field. In the xfs_iflush_done() routine we
-	 * clear ili_last_fields, since we know that the information
-	 * those bits represent is permanently on disk. As long as
-	 * the flush completes before the inode is logged again, then
-	 * both ilf_fields and ili_last_fields will be cleared.
+	 * What we do is move the bits to the ili_last_fields field. When
+	 * logging the inode, these bits are moved back to the ili_fields field.
+	 * In the xfs_iflush_done() routine we clear ili_last_fields, since we
+	 * know that the information those bits represent is permanently on
+	 * disk. As long as the flush completes before the inode is logged
+	 * again, then both ili_fields and ili_last_fields will be cleared.
 	 *
-	 * We can play with the ilf_fields bits here, because the inode
-	 * lock must be held exclusively in order to set bits there
-	 * and the flush lock protects the ili_last_fields bits.
-	 * Set ili_logged so the flush done
-	 * routine can tell whether or not to look in the AIL.
-	 * Also, store the current LSN of the inode so that we can tell
-	 * whether the item has moved in the AIL from xfs_iflush_done().
-	 * In order to read the lsn we need the AIL lock, because
-	 * it is a 64 bit value that cannot be read atomically.
+	 * We can play with the ili_fields bits here, because the inode lock
+	 * must be held exclusively in order to set bits there and the flush
+	 * lock protects the ili_last_fields bits. Set ili_logged so the flush
+	 * done routine can tell whether or not to look in the AIL. Also, store
+	 * the current LSN of the inode so that we can tell whether the item has
+	 * moved in the AIL from xfs_iflush_done(). In order to read the lsn we
+	 * need the AIL lock, because it is a 64 bit value that cannot be read
+	 * atomically.
 	 */
-	if (iip != NULL && iip->ili_format.ilf_fields != 0) {
-		iip->ili_last_fields = iip->ili_format.ilf_fields;
-		iip->ili_format.ilf_fields = 0;
+	if (iip != NULL && iip->ili_fields != 0) {
+		iip->ili_last_fields = iip->ili_fields;
+		iip->ili_fields = 0;
 		iip->ili_logged = 1;
 
 		xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
@@ -2711,8 +2686,7 @@ xfs_iflush_int(
 	} else {
 		/*
 		 * We're flushing an inode which is not in the AIL and has
-		 * not been logged but has i_update_core set. For this
-		 * case we can use a B_DELWRI flush and immediately drop
+		 * not been logged. For this case we can immediately drop
 		 * the inode flush lock because we can avoid the whole
 		 * AIL state thing. It's OK to drop the flush lock now,
 		 * because we've already locked the buffer and to do anything
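The long comment above describes a two-stage dirty-bit handoff between logging and flushing. A compact stand-alone model of the ili_fields/ili_last_fields lifecycle, with plain ints replacing the kernel's inode log item (the flag values are arbitrary):

#include <stdio.h>

struct inode_item {
	unsigned int	fields;		/* dirtied since last flush */
	unsigned int	last_fields;	/* bits covered by the in-flight flush */
	int		logged;
};

static void flush_start(struct inode_item *iip)
{
	/* hand the bits off; new dirtying accumulates in fields again */
	iip->last_fields = iip->fields;
	iip->fields = 0;
	iip->logged = 1;
}

static void flush_done(struct inode_item *iip)
{
	/* the data for these bits is now permanently on disk */
	iip->last_fields = 0;
	iip->logged = 0;
}

int main(void)
{
	struct inode_item iip = { .fields = 0x6 };

	flush_start(&iip);
	iip.fields |= 0x2;	/* relogged while the flush is in flight */
	flush_done(&iip);
	printf("fields=0x%x last=0x%x\n", iip.fields, iip.last_fields);
	return 0;
}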
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 2f27b7454085..7fee3387e1c8 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -241,7 +241,6 @@ typedef struct xfs_inode {
 	spinlock_t		i_flags_lock;	/* inode i_flags lock */
 	/* Miscellaneous state. */
 	unsigned long		i_flags;	/* see defined flags below */
-	unsigned char		i_update_core;	/* timestamps/size is dirty */
 	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
 
 	xfs_icdinode_t		i_d;		/* most of ondisk inode */
@@ -275,6 +274,20 @@ static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
 }
 
 /*
+ * If this I/O goes past the on-disk inode size update it unless it would
+ * be past the current in-core inode size.
+ */
+static inline xfs_fsize_t
+xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
+{
+	xfs_fsize_t i_size = i_size_read(VFS_I(ip));
+
+	if (new_size > i_size)
+		new_size = i_size;
+	return new_size > ip->i_d.di_size ? new_size : 0;
+}
+
+/*
  * i_flags helper functions
  */
 static inline void
@@ -374,10 +387,11 @@ xfs_set_projid(struct xfs_inode *ip,
 #define XFS_IFLOCK		(1 << __XFS_IFLOCK_BIT)
 #define __XFS_IPINNED_BIT	8	 /* wakeup key for zero pin count */
 #define XFS_IPINNED		(1 << __XFS_IPINNED_BIT)
+#define XFS_IDONTCACHE		(1 << 9) /* don't cache the inode long term */
 
 /*
  * Per-lifetime flags need to be reset when re-using a reclaimable inode during
- * inode lookup. Thi prevents unintended behaviour on the new inode from
+ * inode lookup. This prevents unintended behaviour on the new inode from
  * ocurring.
  */
 #define XFS_IRECLAIM_RESET_FLAGS	\
@@ -422,7 +436,6 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
 #define	XFS_IOLOCK_SHARED	(1<<1)
 #define	XFS_ILOCK_EXCL		(1<<2)
 #define	XFS_ILOCK_SHARED	(1<<3)
-#define	XFS_IUNLOCK_NONOTIFY	(1<<4)
 
 #define XFS_LOCK_MASK		(XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
 				| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)
@@ -431,8 +444,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
 	{ XFS_IOLOCK_EXCL,	"IOLOCK_EXCL" }, \
 	{ XFS_IOLOCK_SHARED,	"IOLOCK_SHARED" }, \
 	{ XFS_ILOCK_EXCL,	"ILOCK_EXCL" }, \
-	{ XFS_ILOCK_SHARED,	"ILOCK_SHARED" }, \
-	{ XFS_IUNLOCK_NONOTIFY,	"IUNLOCK_NONOTIFY" }
+	{ XFS_ILOCK_SHARED,	"ILOCK_SHARED" }
 
 
 /*
@@ -522,10 +534,6 @@ void xfs_promote_inode(struct xfs_inode *);
 void		xfs_lock_inodes(xfs_inode_t **, int, uint);
 void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
-void		xfs_synchronize_times(xfs_inode_t *);
-void		xfs_mark_inode_dirty(xfs_inode_t *);
-void		xfs_mark_inode_dirty_sync(xfs_inode_t *);
-
 #define IHOLD(ip) \
 do { \
 	ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
@@ -546,6 +554,7 @@ do { \
  */
 #define XFS_IGET_CREATE		0x1
 #define XFS_IGET_UNTRUSTED	0x2
+#define XFS_IGET_DONTCACHE	0x4
 
 int		xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
 			  xfs_ino_t, struct xfs_dinode **,
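The new xfs_new_eof() helper above clamps a proposed on-disk size update: never past the in-core size, and zero (meaning "no update needed") when the write does not extend the on-disk size. A stand-alone copy of that logic with a tiny driver; the types are simplified to plain 64-bit integers:

#include <stdio.h>
#include <stdint.h>

struct toy_inode {
	int64_t	vfs_size;	/* in-core size, i_size_read() in XFS */
	int64_t	di_size;	/* on-disk size */
};

static int64_t new_eof(struct toy_inode *ip, int64_t new_size)
{
	/* never extend past the in-core size... */
	if (new_size > ip->vfs_size)
		new_size = ip->vfs_size;
	/* ...and return 0 when the on-disk size needs no update */
	return new_size > ip->di_size ? new_size : 0;
}

int main(void)
{
	struct toy_inode ip = { .vfs_size = 8192, .di_size = 4096 };

	printf("%lld\n", (long long)new_eof(&ip, 6000));	/* 6000 */
	printf("%lld\n", (long long)new_eof(&ip, 100000));	/* clamped: 8192 */
	printf("%lld\n", (long long)new_eof(&ip, 1000));	/* 0: no update */
	return 0;
}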
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 91d71dcd4852..05d924efceaf 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -57,77 +57,28 @@ xfs_inode_item_size(
 	struct xfs_inode	*ip = iip->ili_inode;
 	uint			nvecs = 2;
 
-	/*
-	 * Only log the data/extents/b-tree root if there is something
-	 * left to log.
-	 */
-	iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
-
 	switch (ip->i_d.di_format) {
 	case XFS_DINODE_FMT_EXTENTS:
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
-			  XFS_ILOG_DEV | XFS_ILOG_UUID);
-		if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) &&
-		    (ip->i_d.di_nextents > 0) &&
-		    (ip->i_df.if_bytes > 0)) {
-			ASSERT(ip->i_df.if_u1.if_extents != NULL);
+		if ((iip->ili_fields & XFS_ILOG_DEXT) &&
+		    ip->i_d.di_nextents > 0 &&
+		    ip->i_df.if_bytes > 0)
 			nvecs++;
-		} else {
-			iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT;
-		}
 		break;
 
 	case XFS_DINODE_FMT_BTREE:
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
-			  XFS_ILOG_DEV | XFS_ILOG_UUID);
-		if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) &&
-		    (ip->i_df.if_broot_bytes > 0)) {
-			ASSERT(ip->i_df.if_broot != NULL);
+		if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
+		    ip->i_df.if_broot_bytes > 0)
 			nvecs++;
-		} else {
-			ASSERT(!(iip->ili_format.ilf_fields &
-				 XFS_ILOG_DBROOT));
-#ifdef XFS_TRANS_DEBUG
-			if (iip->ili_root_size > 0) {
-				ASSERT(iip->ili_root_size ==
-				       ip->i_df.if_broot_bytes);
-				ASSERT(memcmp(iip->ili_orig_root,
-					      ip->i_df.if_broot,
-					      iip->ili_root_size) == 0);
-			} else {
-				ASSERT(ip->i_df.if_broot_bytes == 0);
-			}
-#endif
-			iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT;
-		}
 		break;
 
 	case XFS_DINODE_FMT_LOCAL:
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
-			  XFS_ILOG_DEV | XFS_ILOG_UUID);
-		if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) &&
-		    (ip->i_df.if_bytes > 0)) {
-			ASSERT(ip->i_df.if_u1.if_data != NULL);
-			ASSERT(ip->i_d.di_size > 0);
+		if ((iip->ili_fields & XFS_ILOG_DDATA) &&
+		    ip->i_df.if_bytes > 0)
 			nvecs++;
-		} else {
-			iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA;
-		}
 		break;
 
 	case XFS_DINODE_FMT_DEV:
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
-			  XFS_ILOG_DEXT | XFS_ILOG_UUID);
-		break;
-
 	case XFS_DINODE_FMT_UUID:
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
-			  XFS_ILOG_DEXT | XFS_ILOG_DEV);
 		break;
 
 	default:
@@ -135,56 +86,31 @@ xfs_inode_item_size(
 		break;
 	}
 
-	/*
-	 * If there are no attributes associated with this file,
-	 * then there cannot be anything more to log.
-	 * Clear all attribute-related log flags.
-	 */
-	if (!XFS_IFORK_Q(ip)) {
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
+	if (!XFS_IFORK_Q(ip))
 		return nvecs;
-	}
+
 
 	/*
 	 * Log any necessary attribute data.
 	 */
 	switch (ip->i_d.di_aformat) {
 	case XFS_DINODE_FMT_EXTENTS:
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
-		if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) &&
-		    (ip->i_d.di_anextents > 0) &&
-		    (ip->i_afp->if_bytes > 0)) {
-			ASSERT(ip->i_afp->if_u1.if_extents != NULL);
+		if ((iip->ili_fields & XFS_ILOG_AEXT) &&
+		    ip->i_d.di_anextents > 0 &&
+		    ip->i_afp->if_bytes > 0)
 			nvecs++;
-		} else {
-			iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT;
-		}
 		break;
 
 	case XFS_DINODE_FMT_BTREE:
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
-		if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) &&
-		    (ip->i_afp->if_broot_bytes > 0)) {
-			ASSERT(ip->i_afp->if_broot != NULL);
+		if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
+		    ip->i_afp->if_broot_bytes > 0)
 			nvecs++;
-		} else {
-			iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT;
-		}
 		break;
 
 	case XFS_DINODE_FMT_LOCAL:
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
-		if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) &&
-		    (ip->i_afp->if_bytes > 0)) {
-			ASSERT(ip->i_afp->if_u1.if_data != NULL);
+		if ((iip->ili_fields & XFS_ILOG_ADATA) &&
+		    ip->i_afp->if_bytes > 0)
 			nvecs++;
-		} else {
-			iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA;
-		}
 		break;
 
 	default:
@@ -254,48 +180,11 @@ xfs_inode_item_format(
 	vecp++;
 	nvecs	     = 1;
 
-	/*
-	 * Clear i_update_core if the timestamps (or any other
-	 * non-transactional modification) need flushing/logging
-	 * and we're about to log them with the rest of the core.
-	 *
-	 * This is the same logic as xfs_iflush() but this code can't
-	 * run at the same time as xfs_iflush because we're in commit
-	 * processing here and so we have the inode lock held in
-	 * exclusive mode. Although it doesn't really matter
-	 * for the timestamps if both routines were to grab the
-	 * timestamps or not. That would be ok.
-	 *
-	 * We clear i_update_core before copying out the data.
-	 * This is for coordination with our timestamp updates
-	 * that don't hold the inode lock. They will always
-	 * update the timestamps BEFORE setting i_update_core,
-	 * so if we clear i_update_core after they set it we
-	 * are guaranteed to see their updates to the timestamps
-	 * either here. Likewise, if they set it after we clear it
-	 * here, we'll see it either on the next commit of this
-	 * inode or the next time the inode gets flushed via
-	 * xfs_iflush(). This depends on strongly ordered memory
-	 * semantics, but we have that. We use the SYNCHRONIZE
-	 * macro to make sure that the compiler does not reorder
-	 * the i_update_core access below the data copy below.
-	 */
-	if (ip->i_update_core) {
-		ip->i_update_core = 0;
-		SYNCHRONIZE();
-	}
-
-	/*
-	 * Make sure to get the latest timestamps from the Linux inode.
-	 */
-	xfs_synchronize_times(ip);
-
 	vecp->i_addr = &ip->i_d;
 	vecp->i_len  = sizeof(struct xfs_icdinode);
 	vecp->i_type = XLOG_REG_TYPE_ICORE;
 	vecp++;
 	nvecs++;
-	iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
 
 	/*
 	 * If this is really an old format inode, then we need to
@@ -328,16 +217,17 @@ xfs_inode_item_format(
 
 	switch (ip->i_d.di_format) {
 	case XFS_DINODE_FMT_EXTENTS:
-		ASSERT(!(iip->ili_format.ilf_fields &
-			 (XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
-			  XFS_ILOG_DEV | XFS_ILOG_UUID)));
-		if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) {
-			ASSERT(ip->i_df.if_bytes > 0);
+		iip->ili_fields &=
+			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID);
+
+		if ((iip->ili_fields & XFS_ILOG_DEXT) &&
+		    ip->i_d.di_nextents > 0 &&
+		    ip->i_df.if_bytes > 0) {
 			ASSERT(ip->i_df.if_u1.if_extents != NULL);
-			ASSERT(ip->i_d.di_nextents > 0);
+			ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0);
 			ASSERT(iip->ili_extents_buf == NULL);
-			ASSERT((ip->i_df.if_bytes /
-				(uint)sizeof(xfs_bmbt_rec_t)) > 0);
+
 #ifdef XFS_NATIVE_HOST
 			if (ip->i_d.di_nextents == ip->i_df.if_bytes /
 			    (uint)sizeof(xfs_bmbt_rec_t)) {
@@ -359,15 +249,18 @@ xfs_inode_item_format(
 			iip->ili_format.ilf_dsize = vecp->i_len;
 			vecp++;
 			nvecs++;
+		} else {
+			iip->ili_fields &= ~XFS_ILOG_DEXT;
 		}
 		break;
 
 	case XFS_DINODE_FMT_BTREE:
-		ASSERT(!(iip->ili_format.ilf_fields &
-			 (XFS_ILOG_DDATA | XFS_ILOG_DEXT |
-			  XFS_ILOG_DEV | XFS_ILOG_UUID)));
+		iip->ili_fields &=
+			~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID);
369 if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) { 261
370 ASSERT(ip->i_df.if_broot_bytes > 0); 262 if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
263 ip->i_df.if_broot_bytes > 0) {
371 ASSERT(ip->i_df.if_broot != NULL); 264 ASSERT(ip->i_df.if_broot != NULL);
372 vecp->i_addr = ip->i_df.if_broot; 265 vecp->i_addr = ip->i_df.if_broot;
373 vecp->i_len = ip->i_df.if_broot_bytes; 266 vecp->i_len = ip->i_df.if_broot_bytes;
@@ -375,15 +268,30 @@ xfs_inode_item_format(
375 vecp++; 268 vecp++;
376 nvecs++; 269 nvecs++;
377 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; 270 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
271 } else {
272 ASSERT(!(iip->ili_fields &
273 XFS_ILOG_DBROOT));
274#ifdef XFS_TRANS_DEBUG
275 if (iip->ili_root_size > 0) {
276 ASSERT(iip->ili_root_size ==
277 ip->i_df.if_broot_bytes);
278 ASSERT(memcmp(iip->ili_orig_root,
279 ip->i_df.if_broot,
280 iip->ili_root_size) == 0);
281 } else {
282 ASSERT(ip->i_df.if_broot_bytes == 0);
283 }
284#endif
285 iip->ili_fields &= ~XFS_ILOG_DBROOT;
378 } 286 }
379 break; 287 break;
380 288
381 case XFS_DINODE_FMT_LOCAL: 289 case XFS_DINODE_FMT_LOCAL:
382 ASSERT(!(iip->ili_format.ilf_fields & 290 iip->ili_fields &=
383 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | 291 ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
384 XFS_ILOG_DEV | XFS_ILOG_UUID))); 292 XFS_ILOG_DEV | XFS_ILOG_UUID);
385 if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) { 293 if ((iip->ili_fields & XFS_ILOG_DDATA) &&
386 ASSERT(ip->i_df.if_bytes > 0); 294 ip->i_df.if_bytes > 0) {
387 ASSERT(ip->i_df.if_u1.if_data != NULL); 295 ASSERT(ip->i_df.if_u1.if_data != NULL);
388 ASSERT(ip->i_d.di_size > 0); 296 ASSERT(ip->i_d.di_size > 0);
389 297
@@ -401,24 +309,26 @@ xfs_inode_item_format(
401 vecp++; 309 vecp++;
402 nvecs++; 310 nvecs++;
403 iip->ili_format.ilf_dsize = (unsigned)data_bytes; 311 iip->ili_format.ilf_dsize = (unsigned)data_bytes;
312 } else {
313 iip->ili_fields &= ~XFS_ILOG_DDATA;
404 } 314 }
405 break; 315 break;
406 316
407 case XFS_DINODE_FMT_DEV: 317 case XFS_DINODE_FMT_DEV:
408 ASSERT(!(iip->ili_format.ilf_fields & 318 iip->ili_fields &=
409 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | 319 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
410 XFS_ILOG_DDATA | XFS_ILOG_UUID))); 320 XFS_ILOG_DEXT | XFS_ILOG_UUID);
411 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { 321 if (iip->ili_fields & XFS_ILOG_DEV) {
412 iip->ili_format.ilf_u.ilfu_rdev = 322 iip->ili_format.ilf_u.ilfu_rdev =
413 ip->i_df.if_u2.if_rdev; 323 ip->i_df.if_u2.if_rdev;
414 } 324 }
415 break; 325 break;
416 326
417 case XFS_DINODE_FMT_UUID: 327 case XFS_DINODE_FMT_UUID:
418 ASSERT(!(iip->ili_format.ilf_fields & 328 iip->ili_fields &=
419 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | 329 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
420 XFS_ILOG_DDATA | XFS_ILOG_DEV))); 330 XFS_ILOG_DEXT | XFS_ILOG_DEV);
421 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { 331 if (iip->ili_fields & XFS_ILOG_UUID) {
422 iip->ili_format.ilf_u.ilfu_uuid = 332 iip->ili_format.ilf_u.ilfu_uuid =
423 ip->i_df.if_u2.if_uuid; 333 ip->i_df.if_u2.if_uuid;
424 } 334 }
@@ -430,31 +340,25 @@ xfs_inode_item_format(
430 } 340 }
431 341
432 /* 342 /*
433 * If there are no attributes associated with the file, 343 * If there are no attributes associated with the file, then we're done.
434 * then we're done.
435 * Assert that no attribute-related log flags are set.
436 */ 344 */
437 if (!XFS_IFORK_Q(ip)) { 345 if (!XFS_IFORK_Q(ip)) {
438 iip->ili_format.ilf_size = nvecs; 346 iip->ili_fields &=
439 ASSERT(!(iip->ili_format.ilf_fields & 347 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
440 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); 348 goto out;
441 return;
442 } 349 }
443 350
444 switch (ip->i_d.di_aformat) { 351 switch (ip->i_d.di_aformat) {
445 case XFS_DINODE_FMT_EXTENTS: 352 case XFS_DINODE_FMT_EXTENTS:
446 ASSERT(!(iip->ili_format.ilf_fields & 353 iip->ili_fields &=
447 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT))); 354 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
448 if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) { 355
449#ifdef DEBUG 356 if ((iip->ili_fields & XFS_ILOG_AEXT) &&
450 int nrecs = ip->i_afp->if_bytes / 357 ip->i_d.di_anextents > 0 &&
451 (uint)sizeof(xfs_bmbt_rec_t); 358 ip->i_afp->if_bytes > 0) {
452 ASSERT(nrecs > 0); 359 ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) ==
453 ASSERT(nrecs == ip->i_d.di_anextents); 360 ip->i_d.di_anextents);
454 ASSERT(ip->i_afp->if_bytes > 0);
455 ASSERT(ip->i_afp->if_u1.if_extents != NULL); 361 ASSERT(ip->i_afp->if_u1.if_extents != NULL);
456 ASSERT(ip->i_d.di_anextents > 0);
457#endif
458#ifdef XFS_NATIVE_HOST 362#ifdef XFS_NATIVE_HOST
459 /* 363 /*
460 * There are no delayed allocation extents 364
@@ -471,29 +375,36 @@ xfs_inode_item_format(
471 iip->ili_format.ilf_asize = vecp->i_len; 375 iip->ili_format.ilf_asize = vecp->i_len;
472 vecp++; 376 vecp++;
473 nvecs++; 377 nvecs++;
378 } else {
379 iip->ili_fields &= ~XFS_ILOG_AEXT;
474 } 380 }
475 break; 381 break;
476 382
477 case XFS_DINODE_FMT_BTREE: 383 case XFS_DINODE_FMT_BTREE:
478 ASSERT(!(iip->ili_format.ilf_fields & 384 iip->ili_fields &=
479 (XFS_ILOG_ADATA | XFS_ILOG_AEXT))); 385 ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
480 if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) { 386
481 ASSERT(ip->i_afp->if_broot_bytes > 0); 387 if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
388 ip->i_afp->if_broot_bytes > 0) {
482 ASSERT(ip->i_afp->if_broot != NULL); 389 ASSERT(ip->i_afp->if_broot != NULL);
390
483 vecp->i_addr = ip->i_afp->if_broot; 391 vecp->i_addr = ip->i_afp->if_broot;
484 vecp->i_len = ip->i_afp->if_broot_bytes; 392 vecp->i_len = ip->i_afp->if_broot_bytes;
485 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; 393 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
486 vecp++; 394 vecp++;
487 nvecs++; 395 nvecs++;
488 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; 396 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
397 } else {
398 iip->ili_fields &= ~XFS_ILOG_ABROOT;
489 } 399 }
490 break; 400 break;
491 401
492 case XFS_DINODE_FMT_LOCAL: 402 case XFS_DINODE_FMT_LOCAL:
493 ASSERT(!(iip->ili_format.ilf_fields & 403 iip->ili_fields &=
494 (XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); 404 ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
495 if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) { 405
496 ASSERT(ip->i_afp->if_bytes > 0); 406 if ((iip->ili_fields & XFS_ILOG_ADATA) &&
407 ip->i_afp->if_bytes > 0) {
497 ASSERT(ip->i_afp->if_u1.if_data != NULL); 408 ASSERT(ip->i_afp->if_u1.if_data != NULL);
498 409
499 vecp->i_addr = ip->i_afp->if_u1.if_data; 410 vecp->i_addr = ip->i_afp->if_u1.if_data;
@@ -510,6 +421,8 @@ xfs_inode_item_format(
510 vecp++; 421 vecp++;
511 nvecs++; 422 nvecs++;
512 iip->ili_format.ilf_asize = (unsigned)data_bytes; 423 iip->ili_format.ilf_asize = (unsigned)data_bytes;
424 } else {
425 iip->ili_fields &= ~XFS_ILOG_ADATA;
513 } 426 }
514 break; 427 break;
515 428
@@ -518,6 +431,15 @@ xfs_inode_item_format(
518 break; 431 break;
519 } 432 }
520 433
434out:
435 /*
436 * Now update the log format that goes out to disk from the in-core
437 * values. We always write the inode core to make the arithmetic
438 * games in recovery easier, which isn't a big deal as just about any
439 * transaction would dirty it anyway.
440 */
441 iip->ili_format.ilf_fields = XFS_ILOG_CORE |
442 (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
521 iip->ili_format.ilf_size = nvecs; 443 iip->ili_format.ilf_size = nvecs;
522} 444}
523 445
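Note: the new out: tail above is the single point where the on-disk ilf_fields
is derived from the in-core state. A standalone sketch of that derivation
(XFS_ILOG_CORE is assumed to be 0x001; the helper name is hypothetical):

	#define XFS_ILOG_CORE		0x001	/* assumed value */
	#define XFS_ILOG_TIMESTAMP	0x4000	/* in-core only, see below */

	/* hypothetical helper mirroring the tail of xfs_inode_item_format() */
	static unsigned int
	ilf_fields_for_disk(unsigned int ili_fields)
	{
		/* always log the core; never let the timestamp bit hit disk */
		return XFS_ILOG_CORE | (ili_fields & ~XFS_ILOG_TIMESTAMP);
	}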
@@ -596,17 +518,13 @@ xfs_inode_item_trylock(
596 /* Stale items should force out the iclog */ 518 /* Stale items should force out the iclog */
597 if (ip->i_flags & XFS_ISTALE) { 519 if (ip->i_flags & XFS_ISTALE) {
598 xfs_ifunlock(ip); 520 xfs_ifunlock(ip);
599 /* 521 xfs_iunlock(ip, XFS_ILOCK_SHARED);
600 * we hold the AIL lock - notify the unlock routine of this
601 * so it doesn't try to get the lock again.
602 */
603 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
604 return XFS_ITEM_PINNED; 522 return XFS_ITEM_PINNED;
605 } 523 }
606 524
607#ifdef DEBUG 525#ifdef DEBUG
608 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 526 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
609 ASSERT(iip->ili_format.ilf_fields != 0); 527 ASSERT(iip->ili_fields != 0);
610 ASSERT(iip->ili_logged == 0); 528 ASSERT(iip->ili_logged == 0);
611 ASSERT(lip->li_flags & XFS_LI_IN_AIL); 529 ASSERT(lip->li_flags & XFS_LI_IN_AIL);
612 } 530 }
@@ -638,7 +556,7 @@ xfs_inode_item_unlock(
638 if (iip->ili_extents_buf != NULL) { 556 if (iip->ili_extents_buf != NULL) {
639 ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS); 557 ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
640 ASSERT(ip->i_d.di_nextents > 0); 558 ASSERT(ip->i_d.di_nextents > 0);
641 ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT); 559 ASSERT(iip->ili_fields & XFS_ILOG_DEXT);
642 ASSERT(ip->i_df.if_bytes > 0); 560 ASSERT(ip->i_df.if_bytes > 0);
643 kmem_free(iip->ili_extents_buf); 561 kmem_free(iip->ili_extents_buf);
644 iip->ili_extents_buf = NULL; 562 iip->ili_extents_buf = NULL;
@@ -646,7 +564,7 @@ xfs_inode_item_unlock(
646 if (iip->ili_aextents_buf != NULL) { 564 if (iip->ili_aextents_buf != NULL) {
647 ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS); 565 ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS);
648 ASSERT(ip->i_d.di_anextents > 0); 566 ASSERT(ip->i_d.di_anextents > 0);
649 ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT); 567 ASSERT(iip->ili_fields & XFS_ILOG_AEXT);
650 ASSERT(ip->i_afp->if_bytes > 0); 568 ASSERT(ip->i_afp->if_bytes > 0);
651 kmem_free(iip->ili_aextents_buf); 569 kmem_free(iip->ili_aextents_buf);
652 iip->ili_aextents_buf = NULL; 570 iip->ili_aextents_buf = NULL;
@@ -761,8 +679,7 @@ xfs_inode_item_push(
761 * lock without sleeping, then there must not have been 679 * lock without sleeping, then there must not have been
762 * anyone in the process of flushing the inode. 680 * anyone in the process of flushing the inode.
763 */ 681 */
764 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || 682 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || iip->ili_fields != 0);
765 iip->ili_format.ilf_fields != 0);
766 683
767 /* 684 /*
768 * Push the inode to its backing buffer. This will not remove the 685
@@ -985,7 +902,7 @@ xfs_iflush_abort(
985 * Clear the inode logging fields so no more flushes are 902 * Clear the inode logging fields so no more flushes are
986 * attempted. 903 * attempted.
987 */ 904 */
988 iip->ili_format.ilf_fields = 0; 905 iip->ili_fields = 0;
989 } 906 }
990 /* 907 /*
991 * Release the inode's flush lock since we're done with it. 908 * Release the inode's flush lock since we're done with it.
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index d3dee61e6d91..41d61c3b7a36 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -86,6 +86,15 @@ typedef struct xfs_inode_log_format_64 {
86#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ 86#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
87#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ 87#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
88 88
89
90/*
91 * The timestamps are dirty, but not necessarily anything else in the inode
92 * core. Unlike the other fields above, this one must never make it to disk
93 * in the ilf_fields of the inode_log_format; it is purely stored in-memory
94 * in ili_fields in the inode_log_item.
95 */
96#define XFS_ILOG_TIMESTAMP 0x4000
97
89#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ 98#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
90 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ 99 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
91 XFS_ILOG_UUID | XFS_ILOG_ADATA | \ 100 XFS_ILOG_UUID | XFS_ILOG_ADATA | \
@@ -101,7 +110,7 @@ typedef struct xfs_inode_log_format_64 {
101 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ 110 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
102 XFS_ILOG_DEV | XFS_ILOG_UUID | \ 111 XFS_ILOG_DEV | XFS_ILOG_UUID | \
103 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ 112 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
104 XFS_ILOG_ABROOT) 113 XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
105 114
106static inline int xfs_ilog_fbroot(int w) 115static inline int xfs_ilog_fbroot(int w)
107{ 116{
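Note: XFS_ILOG_TIMESTAMP is placed at 0x4000, well clear of the on-disk flags
above, precisely because it may never be written out. After
xfs_inode_item_format() runs, the following invariant should hold
(assertion-style sketch, variables as in xfs_inode_item.c, not part of the
patch):

	/* the in-core-only bit has been masked off the on-disk copy */
	ASSERT(!(iip->ili_format.ilf_fields & XFS_ILOG_TIMESTAMP));
	ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_CORE);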
@@ -134,6 +143,7 @@ typedef struct xfs_inode_log_item {
134 unsigned short ili_lock_flags; /* lock flags */ 143 unsigned short ili_lock_flags; /* lock flags */
135 unsigned short ili_logged; /* flushed logged data */ 144 unsigned short ili_logged; /* flushed logged data */
136 unsigned int ili_last_fields; /* fields when flushed */ 145 unsigned int ili_last_fields; /* fields when flushed */
146 unsigned int ili_fields; /* fields to be logged */
137 struct xfs_bmbt_rec *ili_extents_buf; /* array of logged 147 struct xfs_bmbt_rec *ili_extents_buf; /* array of logged
138 data exts */ 148 data exts */
139 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged 149 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
@@ -148,9 +158,7 @@ typedef struct xfs_inode_log_item {
148 158
149static inline int xfs_inode_clean(xfs_inode_t *ip) 159static inline int xfs_inode_clean(xfs_inode_t *ip)
150{ 160{
151 return (!ip->i_itemp || 161 return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL);
152 !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
153 !ip->i_update_core;
154} 162}
155 163
156extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); 164extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
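Note: folding XFS_ILOG_TIMESTAMP into XFS_ILOG_ALL is what lets
xfs_inode_clean() drop the i_update_core test: a timestamp-only update now
keeps the inode dirty through ili_fields alone. Sketched as a hypothetical
check:

	xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); /* timestamps only */
	ASSERT(!xfs_inode_clean(ip));	/* dirty via ili_fields, no side channel */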
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 76f3ca5cfc36..91f8ff547ab3 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -209,6 +209,7 @@ xfs_open_by_handle(
209 struct file *filp; 209 struct file *filp;
210 struct inode *inode; 210 struct inode *inode;
211 struct dentry *dentry; 211 struct dentry *dentry;
212 fmode_t fmode;
212 213
213 if (!capable(CAP_SYS_ADMIN)) 214 if (!capable(CAP_SYS_ADMIN))
214 return -XFS_ERROR(EPERM); 215 return -XFS_ERROR(EPERM);
@@ -228,26 +229,21 @@ xfs_open_by_handle(
228 hreq->oflags |= O_LARGEFILE; 229 hreq->oflags |= O_LARGEFILE;
229#endif 230#endif
230 231
231 /* Put open permission in namei format. */
232 permflag = hreq->oflags; 232 permflag = hreq->oflags;
233 if ((permflag+1) & O_ACCMODE) 233 fmode = OPEN_FMODE(permflag);
234 permflag++;
235 if (permflag & O_TRUNC)
236 permflag |= 2;
237
238 if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && 234 if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
239 (permflag & FMODE_WRITE) && IS_APPEND(inode)) { 235 (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
240 error = -XFS_ERROR(EPERM); 236 error = -XFS_ERROR(EPERM);
241 goto out_dput; 237 goto out_dput;
242 } 238 }
243 239
244 if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) { 240 if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
245 error = -XFS_ERROR(EACCES); 241 error = -XFS_ERROR(EACCES);
246 goto out_dput; 242 goto out_dput;
247 } 243 }
248 244
249 /* Can't write directories. */ 245 /* Can't write directories. */
250 if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) { 246 if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
251 error = -XFS_ERROR(EISDIR); 247 error = -XFS_ERROR(EISDIR);
252 goto out_dput; 248 goto out_dput;
253 } 249 }
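Note: OPEN_FMODE() replaces the hand-rolled "namei format" conversion deleted
above. The generic <linux/fs.h> helper is believed to reduce to the same
(flags + 1) & O_ACCMODE trick:

	/* sketch of the helper this code now relies on */
	#define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \
						    (flag & __FMODE_NONOTIFY)))

so fmode carries FMODE_READ/FMODE_WRITE derived from the open flags, while
O_APPEND and O_TRUNC are still tested on the raw permflag.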
@@ -450,9 +446,12 @@ xfs_attrmulti_attr_get(
450 446
451 if (*len > XATTR_SIZE_MAX) 447 if (*len > XATTR_SIZE_MAX)
452 return EINVAL; 448 return EINVAL;
453 kbuf = kmalloc(*len, GFP_KERNEL); 449 kbuf = kmem_zalloc(*len, KM_SLEEP | KM_MAYFAIL);
454 if (!kbuf) 450 if (!kbuf) {
455 return ENOMEM; 451 kbuf = kmem_zalloc_large(*len);
452 if (!kbuf)
453 return ENOMEM;
454 }
456 455
457 error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); 456 error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
458 if (error) 457 if (error)
@@ -462,7 +461,10 @@ xfs_attrmulti_attr_get(
462 error = EFAULT; 461 error = EFAULT;
463 462
464 out_kfree: 463 out_kfree:
465 kfree(kbuf); 464 if (is_vmalloc_addr(kbuf))
465 kmem_free_large(kbuf);
466 else
467 kmem_free(kbuf);
466 return error; 468 return error;
467} 469}
468 470
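Note: the allocation change lets xattr buffers up to XATTR_SIZE_MAX succeed
even when physically contiguous pages are scarce: try the slab allocator
first, fall back to a vmalloc-backed buffer, and pick the matching free path
by address. The same pattern in plain kernel primitives (a hedged sketch; the
XFS kmem_* wrappers are assumed to map onto these):

	#include <linux/mm.h>		/* is_vmalloc_addr() */
	#include <linux/slab.h>
	#include <linux/vmalloc.h>

	static void *zalloc_maybe_large(size_t len)
	{
		void *buf = kzalloc(len, GFP_KERNEL | __GFP_NOWARN);

		return buf ? buf : vzalloc(len);	/* vmalloc fallback */
	}

	static void free_maybe_large(void *buf)
	{
		if (is_vmalloc_addr(buf))
			vfree(buf);
		else
			kfree(buf);
	}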
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index f9ccb7b7c043..a849a5473aff 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -293,7 +293,7 @@ xfs_compat_ioc_bulkstat(
293 int res; 293 int res;
294 294
295 error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer, 295 error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
296 sizeof(compat_xfs_bstat_t), 0, &res); 296 sizeof(compat_xfs_bstat_t), NULL, &res);
297 } else if (cmd == XFS_IOC_FSBULKSTAT_32) { 297 } else if (cmd == XFS_IOC_FSBULKSTAT_32) {
298 error = xfs_bulkstat(mp, &inlast, &count, 298 error = xfs_bulkstat(mp, &inlast, &count,
299 xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t), 299 xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 246c7d57c6f9..71a464503c43 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -31,6 +31,7 @@
31#include "xfs_ialloc_btree.h" 31#include "xfs_ialloc_btree.h"
32#include "xfs_dinode.h" 32#include "xfs_dinode.h"
33#include "xfs_inode.h" 33#include "xfs_inode.h"
34#include "xfs_inode_item.h"
34#include "xfs_btree.h" 35#include "xfs_btree.h"
35#include "xfs_bmap.h" 36#include "xfs_bmap.h"
36#include "xfs_rtalloc.h" 37#include "xfs_rtalloc.h"
@@ -645,6 +646,7 @@ xfs_iomap_write_unwritten(
645 xfs_trans_t *tp; 646 xfs_trans_t *tp;
646 xfs_bmbt_irec_t imap; 647 xfs_bmbt_irec_t imap;
647 xfs_bmap_free_t free_list; 648 xfs_bmap_free_t free_list;
649 xfs_fsize_t i_size;
648 uint resblks; 650 uint resblks;
649 int committed; 651 int committed;
650 int error; 652 int error;
@@ -705,7 +707,22 @@ xfs_iomap_write_unwritten(
705 if (error) 707 if (error)
706 goto error_on_bmapi_transaction; 708 goto error_on_bmapi_transaction;
707 709
708 error = xfs_bmap_finish(&(tp), &(free_list), &committed); 710 /*
711 * Log the updated inode size as we go. We have to be careful
712 * to only log it up to the actual write offset if it is
713 * halfway into a block.
714 */
715 i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
716 if (i_size > offset + count)
717 i_size = offset + count;
718
719 i_size = xfs_new_eof(ip, i_size);
720 if (i_size) {
721 ip->i_d.di_size = i_size;
722 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
723 }
724
725 error = xfs_bmap_finish(&tp, &free_list, &committed);
709 if (error) 726 if (error)
710 goto error_on_bmapi_transaction; 727 goto error_on_bmapi_transaction;
711 728
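Note: the unwritten-extent conversion works in whole filesystem blocks, so the
candidate EOF computed from offset_fsb + count_fsb can overshoot the bytes
actually written; the clamp to offset + count keeps a partial final block from
inflating the logged size. The update, annotated in isolation:

	i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb); /* block-rounded */
	if (i_size > offset + count)
		i_size = offset + count;	/* clamp to the real write */

	i_size = xfs_new_eof(ip, i_size);	/* nonzero only if EOF grows */
	if (i_size) {
		ip->i_d.di_size = i_size;
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	}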
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ab302539e5b9..3011b879f850 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -50,65 +50,15 @@
50#include <linux/fiemap.h> 50#include <linux/fiemap.h>
51#include <linux/slab.h> 51#include <linux/slab.h>
52 52
53/* 53static int
54 * Bring the timestamps in the XFS inode uptodate. 54xfs_initxattrs(
55 * 55 struct inode *inode,
56 * Used before writing the inode to disk. 56 const struct xattr *xattr_array,
57 */ 57 void *fs_info)
58void
59xfs_synchronize_times(
60 xfs_inode_t *ip)
61{
62 struct inode *inode = VFS_I(ip);
63
64 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
65 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
66 ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
67 ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
68 ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
69 ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
70}
71
72/*
73 * If the linux inode is valid, mark it dirty, else mark the dirty state
74 * in the XFS inode to make sure we pick it up when reclaiming the inode.
75 */
76void
77xfs_mark_inode_dirty_sync(
78 xfs_inode_t *ip)
79{
80 struct inode *inode = VFS_I(ip);
81
82 if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
83 mark_inode_dirty_sync(inode);
84 else {
85 barrier();
86 ip->i_update_core = 1;
87 }
88}
89
90void
91xfs_mark_inode_dirty(
92 xfs_inode_t *ip)
93{
94 struct inode *inode = VFS_I(ip);
95
96 if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
97 mark_inode_dirty(inode);
98 else {
99 barrier();
100 ip->i_update_core = 1;
101 }
102
103}
104
105
106int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
107 void *fs_info)
108{ 58{
109 const struct xattr *xattr; 59 const struct xattr *xattr;
110 struct xfs_inode *ip = XFS_I(inode); 60 struct xfs_inode *ip = XFS_I(inode);
111 int error = 0; 61 int error = 0;
112 62
113 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 63 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
114 error = xfs_attr_set(ip, xattr->name, xattr->value, 64 error = xfs_attr_set(ip, xattr->name, xattr->value,
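Note: xfs_synchronize_times() and the xfs_mark_inode_dirty*() helpers removed
above were the remaining users of i_update_core. Timestamp changes are now
expected to reach the incore dinode through a transaction, keeping the copy
current at commit time instead of patching it up at flush time; a hedged
sketch of the replacement idiom:

	/* update the VFS inode and the incore dinode together, then log */
	inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
	ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
	ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);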
@@ -678,19 +628,16 @@ xfs_setattr_nonsize(
678 inode->i_atime = iattr->ia_atime; 628 inode->i_atime = iattr->ia_atime;
679 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; 629 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
680 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; 630 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
681 ip->i_update_core = 1;
682 } 631 }
683 if (mask & ATTR_CTIME) { 632 if (mask & ATTR_CTIME) {
684 inode->i_ctime = iattr->ia_ctime; 633 inode->i_ctime = iattr->ia_ctime;
685 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; 634 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
686 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; 635 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
687 ip->i_update_core = 1;
688 } 636 }
689 if (mask & ATTR_MTIME) { 637 if (mask & ATTR_MTIME) {
690 inode->i_mtime = iattr->ia_mtime; 638 inode->i_mtime = iattr->ia_mtime;
691 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; 639 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
692 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; 640 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
693 ip->i_update_core = 1;
694 } 641 }
695 642
696 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 643 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -918,13 +865,11 @@ xfs_setattr_size(
918 inode->i_ctime = iattr->ia_ctime; 865 inode->i_ctime = iattr->ia_ctime;
919 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; 866 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
920 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; 867 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
921 ip->i_update_core = 1;
922 } 868 }
923 if (mask & ATTR_MTIME) { 869 if (mask & ATTR_MTIME) {
924 inode->i_mtime = iattr->ia_mtime; 870 inode->i_mtime = iattr->ia_mtime;
925 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; 871 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
926 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; 872 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
927 ip->i_update_core = 1;
928 } 873 }
929 874
930 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 875 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 751e94fe1f77..acc2bf264dab 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -62,7 +62,6 @@ xfs_bulkstat_one_int(
62{ 62{
63 struct xfs_icdinode *dic; /* dinode core info pointer */ 63 struct xfs_icdinode *dic; /* dinode core info pointer */
64 struct xfs_inode *ip; /* incore inode pointer */ 64 struct xfs_inode *ip; /* incore inode pointer */
65 struct inode *inode;
66 struct xfs_bstat *buf; /* return buffer */ 65 struct xfs_bstat *buf; /* return buffer */
67 int error = 0; /* error value */ 66 int error = 0; /* error value */
68 67
@@ -76,7 +75,8 @@ xfs_bulkstat_one_int(
76 return XFS_ERROR(ENOMEM); 75 return XFS_ERROR(ENOMEM);
77 76
78 error = xfs_iget(mp, NULL, ino, 77 error = xfs_iget(mp, NULL, ino,
79 XFS_IGET_UNTRUSTED, XFS_ILOCK_SHARED, &ip); 78 (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED),
79 XFS_ILOCK_SHARED, &ip);
80 if (error) { 80 if (error) {
81 *stat = BULKSTAT_RV_NOTHING; 81 *stat = BULKSTAT_RV_NOTHING;
82 goto out_free; 82 goto out_free;
@@ -86,7 +86,6 @@ xfs_bulkstat_one_int(
86 ASSERT(ip->i_imap.im_blkno != 0); 86 ASSERT(ip->i_imap.im_blkno != 0);
87 87
88 dic = &ip->i_d; 88 dic = &ip->i_d;
89 inode = VFS_I(ip);
90 89
91 /* xfs_iget returns the following without needing 90 /* xfs_iget returns the following without needing
92 * further change. 91 * further change.
@@ -99,19 +98,12 @@ xfs_bulkstat_one_int(
99 buf->bs_uid = dic->di_uid; 98 buf->bs_uid = dic->di_uid;
100 buf->bs_gid = dic->di_gid; 99 buf->bs_gid = dic->di_gid;
101 buf->bs_size = dic->di_size; 100 buf->bs_size = dic->di_size;
102 101 buf->bs_atime.tv_sec = dic->di_atime.t_sec;
103 /* 102 buf->bs_atime.tv_nsec = dic->di_atime.t_nsec;
104 * We need to read the timestamps from the Linux inode because 103 buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
105 * the VFS keeps writing directly into the inode structure instead 104 buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
106 * of telling us about the updates. 105 buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
107 */ 106 buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
108 buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
109 buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
110 buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
111 buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
112 buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
113 buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
114
115 buf->bs_xflags = xfs_ip2xflags(ip); 107 buf->bs_xflags = xfs_ip2xflags(ip);
116 buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; 108 buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
117 buf->bs_extents = dic->di_nextents; 109 buf->bs_extents = dic->di_nextents;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e2cc3568c299..6db1fef38bff 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -67,15 +67,10 @@ STATIC void xlog_state_switch_iclogs(xlog_t *log,
67 int eventual_size); 67 int eventual_size);
68STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); 68STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
69 69
70/* local functions to manipulate grant head */
71STATIC int xlog_grant_log_space(xlog_t *log,
72 xlog_ticket_t *xtic);
73STATIC void xlog_grant_push_ail(struct log *log, 70STATIC void xlog_grant_push_ail(struct log *log,
74 int need_bytes); 71 int need_bytes);
75STATIC void xlog_regrant_reserve_log_space(xlog_t *log, 72STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
76 xlog_ticket_t *ticket); 73 xlog_ticket_t *ticket);
77STATIC int xlog_regrant_write_log_space(xlog_t *log,
78 xlog_ticket_t *ticket);
79STATIC void xlog_ungrant_log_space(xlog_t *log, 74STATIC void xlog_ungrant_log_space(xlog_t *log,
80 xlog_ticket_t *ticket); 75 xlog_ticket_t *ticket);
81 76
@@ -150,78 +145,93 @@ xlog_grant_add_space(
150 } while (head_val != old); 145 } while (head_val != old);
151} 146}
152 147
153STATIC bool 148STATIC void
154xlog_reserveq_wake( 149xlog_grant_head_init(
155 struct log *log, 150 struct xlog_grant_head *head)
156 int *free_bytes) 151{
152 xlog_assign_grant_head(&head->grant, 1, 0);
153 INIT_LIST_HEAD(&head->waiters);
154 spin_lock_init(&head->lock);
155}
156
157STATIC void
158xlog_grant_head_wake_all(
159 struct xlog_grant_head *head)
157{ 160{
158 struct xlog_ticket *tic; 161 struct xlog_ticket *tic;
159 int need_bytes;
160 162
161 list_for_each_entry(tic, &log->l_reserveq, t_queue) { 163 spin_lock(&head->lock);
164 list_for_each_entry(tic, &head->waiters, t_queue)
165 wake_up_process(tic->t_task);
166 spin_unlock(&head->lock);
167}
168
169static inline int
170xlog_ticket_reservation(
171 struct log *log,
172 struct xlog_grant_head *head,
173 struct xlog_ticket *tic)
174{
175 if (head == &log->l_write_head) {
176 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
177 return tic->t_unit_res;
178 } else {
162 if (tic->t_flags & XLOG_TIC_PERM_RESERV) 179 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
163 need_bytes = tic->t_unit_res * tic->t_cnt; 180 return tic->t_unit_res * tic->t_cnt;
164 else 181 else
165 need_bytes = tic->t_unit_res; 182 return tic->t_unit_res;
166
167 if (*free_bytes < need_bytes)
168 return false;
169 *free_bytes -= need_bytes;
170
171 trace_xfs_log_grant_wake_up(log, tic);
172 wake_up(&tic->t_wait);
173 } 183 }
174
175 return true;
176} 184}
177 185
178STATIC bool 186STATIC bool
179xlog_writeq_wake( 187xlog_grant_head_wake(
180 struct log *log, 188 struct log *log,
189 struct xlog_grant_head *head,
181 int *free_bytes) 190 int *free_bytes)
182{ 191{
183 struct xlog_ticket *tic; 192 struct xlog_ticket *tic;
184 int need_bytes; 193 int need_bytes;
185 194
186 list_for_each_entry(tic, &log->l_writeq, t_queue) { 195 list_for_each_entry(tic, &head->waiters, t_queue) {
187 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); 196 need_bytes = xlog_ticket_reservation(log, head, tic);
188
189 need_bytes = tic->t_unit_res;
190
191 if (*free_bytes < need_bytes) 197 if (*free_bytes < need_bytes)
192 return false; 198 return false;
193 *free_bytes -= need_bytes;
194 199
195 trace_xfs_log_regrant_write_wake_up(log, tic); 200 *free_bytes -= need_bytes;
196 wake_up(&tic->t_wait); 201 trace_xfs_log_grant_wake_up(log, tic);
202 wake_up_process(tic->t_task);
197 } 203 }
198 204
199 return true; 205 return true;
200} 206}
201 207
202STATIC int 208STATIC int
203xlog_reserveq_wait( 209xlog_grant_head_wait(
204 struct log *log, 210 struct log *log,
211 struct xlog_grant_head *head,
205 struct xlog_ticket *tic, 212 struct xlog_ticket *tic,
206 int need_bytes) 213 int need_bytes)
207{ 214{
208 list_add_tail(&tic->t_queue, &log->l_reserveq); 215 list_add_tail(&tic->t_queue, &head->waiters);
209 216
210 do { 217 do {
211 if (XLOG_FORCED_SHUTDOWN(log)) 218 if (XLOG_FORCED_SHUTDOWN(log))
212 goto shutdown; 219 goto shutdown;
213 xlog_grant_push_ail(log, need_bytes); 220 xlog_grant_push_ail(log, need_bytes);
214 221
222 __set_current_state(TASK_UNINTERRUPTIBLE);
223 spin_unlock(&head->lock);
224
215 XFS_STATS_INC(xs_sleep_logspace); 225 XFS_STATS_INC(xs_sleep_logspace);
216 trace_xfs_log_grant_sleep(log, tic);
217 226
218 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); 227 trace_xfs_log_grant_sleep(log, tic);
228 schedule();
219 trace_xfs_log_grant_wake(log, tic); 229 trace_xfs_log_grant_wake(log, tic);
220 230
221 spin_lock(&log->l_grant_reserve_lock); 231 spin_lock(&head->lock);
222 if (XLOG_FORCED_SHUTDOWN(log)) 232 if (XLOG_FORCED_SHUTDOWN(log))
223 goto shutdown; 233 goto shutdown;
224 } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes); 234 } while (xlog_space_left(log, &head->grant) < need_bytes);
225 235
226 list_del_init(&tic->t_queue); 236 list_del_init(&tic->t_queue);
227 return 0; 237 return 0;
@@ -230,35 +240,58 @@ shutdown:
230 return XFS_ERROR(EIO); 240 return XFS_ERROR(EIO);
231} 241}
232 242
243/*
244 * Atomically get the log space required for a log ticket.
245 *
246 * Once a ticket gets put onto head->waiters, the caller will only return
247 * after the needed reservation is satisfied.
248 *
249 * This function is structured so that it has a lock-free fast path. This is
250 * necessary because every new transaction reservation will come through this
251 * path. Hence any lock will be globally hot if we take it unconditionally on
252 * every pass.
253 *
254 * As tickets are only ever moved on and off head->waiters under head->lock, we
255 * only need to take that lock if we are going to add the ticket to the queue
256 * and sleep. We can avoid taking the lock if the ticket was never added to
257 * head->waiters because the t_queue list head will be empty and we hold the
258 * only reference to it so it can safely be checked unlocked.
259 */
233STATIC int 260STATIC int
234xlog_writeq_wait( 261xlog_grant_head_check(
235 struct log *log, 262 struct log *log,
263 struct xlog_grant_head *head,
236 struct xlog_ticket *tic, 264 struct xlog_ticket *tic,
237 int need_bytes) 265 int *need_bytes)
238{ 266{
239 list_add_tail(&tic->t_queue, &log->l_writeq); 267 int free_bytes;
240 268 int error = 0;
241 do {
242 if (XLOG_FORCED_SHUTDOWN(log))
243 goto shutdown;
244 xlog_grant_push_ail(log, need_bytes);
245
246 XFS_STATS_INC(xs_sleep_logspace);
247 trace_xfs_log_regrant_write_sleep(log, tic);
248 269
249 xlog_wait(&tic->t_wait, &log->l_grant_write_lock); 270 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
250 trace_xfs_log_regrant_write_wake(log, tic);
251 271
252 spin_lock(&log->l_grant_write_lock); 272 /*
253 if (XLOG_FORCED_SHUTDOWN(log)) 273 * If there are other waiters on the queue then give them a chance at
254 goto shutdown; 274 * logspace before us. Wake up the first waiters; if we do not wake
255 } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes); 275 * up all the waiters, go to sleep waiting for more free space;
276 * otherwise try to get some space for this transaction.
277 */
278 *need_bytes = xlog_ticket_reservation(log, head, tic);
279 free_bytes = xlog_space_left(log, &head->grant);
280 if (!list_empty_careful(&head->waiters)) {
281 spin_lock(&head->lock);
282 if (!xlog_grant_head_wake(log, head, &free_bytes) ||
283 free_bytes < *need_bytes) {
284 error = xlog_grant_head_wait(log, head, tic,
285 *need_bytes);
286 }
287 spin_unlock(&head->lock);
288 } else if (free_bytes < *need_bytes) {
289 spin_lock(&head->lock);
290 error = xlog_grant_head_wait(log, head, tic, *need_bytes);
291 spin_unlock(&head->lock);
292 }
256 293
257 list_del_init(&tic->t_queue); 294 return error;
258 return 0;
259shutdown:
260 list_del_init(&tic->t_queue);
261 return XFS_ERROR(EIO);
262} 295}
263 296
264static void 297static void
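Note: the comment above now covers both grant heads with one shared fast path.
Reduced to its control flow (a simplified sketch that omits the waiter wake-up
pass; variables as in xlog_grant_head_check(), and safety rests on the stated
rule that t_queue membership only changes under head->lock, which makes the
unlocked list_empty_careful() check sound):

	free_bytes = xlog_space_left(log, &head->grant);
	if (list_empty_careful(&head->waiters) && free_bytes >= need_bytes)
		return 0;		/* fast path: no lock taken */

	spin_lock(&head->lock);		/* slow path: queue up and sleep */
	error = xlog_grant_head_wait(log, head, tic, need_bytes);
	spin_unlock(&head->lock);
	return error;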
@@ -286,6 +319,128 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
286} 319}
287 320
288/* 321/*
322 * Replenish the byte reservation required by moving the grant write head.
323 */
324int
325xfs_log_regrant(
326 struct xfs_mount *mp,
327 struct xlog_ticket *tic)
328{
329 struct log *log = mp->m_log;
330 int need_bytes;
331 int error = 0;
332
333 if (XLOG_FORCED_SHUTDOWN(log))
334 return XFS_ERROR(EIO);
335
336 XFS_STATS_INC(xs_try_logspace);
337
338 /*
339 * This is a new transaction on the ticket, so we need to change the
340 * transaction ID so that the next transaction has a different TID in
341 * the log. Just add one to the existing tid so that we can see chains
342 * of rolling transactions in the log easily.
343 */
344 tic->t_tid++;
345
346 xlog_grant_push_ail(log, tic->t_unit_res);
347
348 tic->t_curr_res = tic->t_unit_res;
349 xlog_tic_reset_res(tic);
350
351 if (tic->t_cnt > 0)
352 return 0;
353
354 trace_xfs_log_regrant(log, tic);
355
356 error = xlog_grant_head_check(log, &log->l_write_head, tic,
357 &need_bytes);
358 if (error)
359 goto out_error;
360
361 xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
362 trace_xfs_log_regrant_exit(log, tic);
363 xlog_verify_grant_tail(log);
364 return 0;
365
366out_error:
367 /*
368 * If we are failing, make sure the ticket doesn't have any current
369 * reservations. We don't want to add this back when the ticket/
370 * transaction gets cancelled.
371 */
372 tic->t_curr_res = 0;
373 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
374 return error;
375}
376
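Note: xfs_log_regrant() is the half of the old xfs_log_reserve() that serviced
an already-issued ticket, and its early-out is worth annotating:

	tic->t_tid++;			/* next link in the rolling chain */
	tic->t_curr_res = tic->t_unit_res;
	xlog_tic_reset_res(tic);
	if (tic->t_cnt > 0)		/* a prepaid unit is still unused */
		return 0;		/* no grant-head traffic at all */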
377/*
378 * Reserve log space and return a ticket corresponding to the reservation.
379 *
380 * Each reservation is going to reserve extra space for a log record header.
381 * When writes happen to the on-disk log, we don't subtract the length of the
382 * log record header from any reservation. By wasting space in each
383 * reservation, we prevent over-allocation problems.
384 */
385int
386xfs_log_reserve(
387 struct xfs_mount *mp,
388 int unit_bytes,
389 int cnt,
390 struct xlog_ticket **ticp,
391 __uint8_t client,
392 bool permanent,
393 uint t_type)
394{
395 struct log *log = mp->m_log;
396 struct xlog_ticket *tic;
397 int need_bytes;
398 int error = 0;
399
400 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
401
402 if (XLOG_FORCED_SHUTDOWN(log))
403 return XFS_ERROR(EIO);
404
405 XFS_STATS_INC(xs_try_logspace);
406
407 ASSERT(*ticp == NULL);
408 tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
409 KM_SLEEP | KM_MAYFAIL);
410 if (!tic)
411 return XFS_ERROR(ENOMEM);
412
413 tic->t_trans_type = t_type;
414 *ticp = tic;
415
416 xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt);
417
418 trace_xfs_log_reserve(log, tic);
419
420 error = xlog_grant_head_check(log, &log->l_reserve_head, tic,
421 &need_bytes);
422 if (error)
423 goto out_error;
424
425 xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes);
426 xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
427 trace_xfs_log_reserve_exit(log, tic);
428 xlog_verify_grant_tail(log);
429 return 0;
430
431out_error:
432 /*
433 * If we are failing, make sure the ticket doesn't have any current
434 * reservations. We don't want to add this back when the ticket/
435 * transaction gets cancelled.
436 */
437 tic->t_curr_res = 0;
438 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
439 return error;
440}
441
442
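Note: with the ticket-reuse branch gone from xfs_log_reserve(), choosing
between the two entry points moves to the caller. A hedged sketch of the
expected xfs_trans_reserve() call site (not part of this hunk; names assumed):

	if (tp->t_ticket != NULL) {
		/* rolling transaction: regrant on the permanent ticket */
		ASSERT(flags & XFS_TRANS_PERM_LOG_RES);
		error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
	} else {
		bool permanent = flags & XFS_TRANS_PERM_LOG_RES;

		error = xfs_log_reserve(tp->t_mountp, logspace, logcount,
					&tp->t_ticket, XFS_TRANSACTION,
					permanent, tp->t_type);
	}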
443/*
289 * NOTES: 444 * NOTES:
290 * 445 *
291 * 1. currblock field gets updated at startup and after in-core logs 446 * 1. currblock field gets updated at startup and after in-core logs
@@ -395,88 +550,6 @@ xfs_log_release_iclog(
395} 550}
396 551
397/* 552/*
398 * 1. Reserve an amount of on-disk log space and return a ticket corresponding
399 * to the reservation.
400 * 2. Potentially, push buffers at tail of log to disk.
401 *
402 * Each reservation is going to reserve extra space for a log record header.
403 * When writes happen to the on-disk log, we don't subtract the length of the
404 * log record header from any reservation. By wasting space in each
405 * reservation, we prevent over allocation problems.
406 */
407int
408xfs_log_reserve(
409 struct xfs_mount *mp,
410 int unit_bytes,
411 int cnt,
412 struct xlog_ticket **ticket,
413 __uint8_t client,
414 uint flags,
415 uint t_type)
416{
417 struct log *log = mp->m_log;
418 struct xlog_ticket *internal_ticket;
419 int retval = 0;
420
421 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
422
423 if (XLOG_FORCED_SHUTDOWN(log))
424 return XFS_ERROR(EIO);
425
426 XFS_STATS_INC(xs_try_logspace);
427
428
429 if (*ticket != NULL) {
430 ASSERT(flags & XFS_LOG_PERM_RESERV);
431 internal_ticket = *ticket;
432
433 /*
434 * this is a new transaction on the ticket, so we need to
435 * change the transaction ID so that the next transaction has a
436 * different TID in the log. Just add one to the existing tid
437 * so that we can see chains of rolling transactions in the log
438 * easily.
439 */
440 internal_ticket->t_tid++;
441
442 trace_xfs_log_reserve(log, internal_ticket);
443
444 xlog_grant_push_ail(log, internal_ticket->t_unit_res);
445 retval = xlog_regrant_write_log_space(log, internal_ticket);
446 } else {
447 /* may sleep if need to allocate more tickets */
448 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
449 client, flags,
450 KM_SLEEP|KM_MAYFAIL);
451 if (!internal_ticket)
452 return XFS_ERROR(ENOMEM);
453 internal_ticket->t_trans_type = t_type;
454 *ticket = internal_ticket;
455
456 trace_xfs_log_reserve(log, internal_ticket);
457
458 xlog_grant_push_ail(log,
459 (internal_ticket->t_unit_res *
460 internal_ticket->t_cnt));
461 retval = xlog_grant_log_space(log, internal_ticket);
462 }
463
464 if (unlikely(retval)) {
465 /*
466 * If we are failing, make sure the ticket doesn't have any
467 * current reservations. We don't want to add this back
468 * when the ticket/ transaction gets cancelled.
469 */
470 internal_ticket->t_curr_res = 0;
471 /* ungrant will give back unit_res * t_cnt. */
472 internal_ticket->t_cnt = 0;
473 }
474
475 return retval;
476}
477
478
479/*
480 * Mount a log filesystem 553 * Mount a log filesystem
481 * 554 *
482 * mp - ubiquitous xfs mount point structure 555 * mp - ubiquitous xfs mount point structure
@@ -653,8 +726,9 @@ xfs_log_unmount_write(xfs_mount_t *mp)
653 .lv_iovecp = &reg, 726 .lv_iovecp = &reg,
654 }; 727 };
655 728
656 /* remove inited flag */ 729 /* remove inited flag, and account for space used */
657 tic->t_flags = 0; 730 tic->t_flags = 0;
731 tic->t_curr_res -= sizeof(magic);
658 error = xlog_write(log, &vec, tic, &lsn, 732 error = xlog_write(log, &vec, tic, &lsn,
659 NULL, XLOG_UNMOUNT_TRANS); 733 NULL, XLOG_UNMOUNT_TRANS);
660 /* 734 /*
@@ -760,64 +834,35 @@ xfs_log_item_init(
760 INIT_LIST_HEAD(&item->li_cil); 834 INIT_LIST_HEAD(&item->li_cil);
761} 835}
762 836
837/*
838 * Wake up processes waiting for log space after we have moved the log tail.
839 */
763void 840void
764xfs_log_move_tail(xfs_mount_t *mp, 841xfs_log_space_wake(
765 xfs_lsn_t tail_lsn) 842 struct xfs_mount *mp)
766{ 843{
767 xlog_ticket_t *tic; 844 struct log *log = mp->m_log;
768 xlog_t *log = mp->m_log; 845 int free_bytes;
769 int need_bytes, free_bytes;
770 846
771 if (XLOG_FORCED_SHUTDOWN(log)) 847 if (XLOG_FORCED_SHUTDOWN(log))
772 return; 848 return;
773 849
774 if (tail_lsn == 0) 850 if (!list_empty_careful(&log->l_write_head.waiters)) {
775 tail_lsn = atomic64_read(&log->l_last_sync_lsn); 851 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
776
777 /* tail_lsn == 1 implies that we weren't passed a valid value. */
778 if (tail_lsn != 1)
779 atomic64_set(&log->l_tail_lsn, tail_lsn);
780
781 if (!list_empty_careful(&log->l_writeq)) {
782#ifdef DEBUG
783 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
784 panic("Recovery problem");
785#endif
786 spin_lock(&log->l_grant_write_lock);
787 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
788 list_for_each_entry(tic, &log->l_writeq, t_queue) {
789 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
790 852
791 if (free_bytes < tic->t_unit_res && tail_lsn != 1) 853 spin_lock(&log->l_write_head.lock);
792 break; 854 free_bytes = xlog_space_left(log, &log->l_write_head.grant);
793 tail_lsn = 0; 855 xlog_grant_head_wake(log, &log->l_write_head, &free_bytes);
794 free_bytes -= tic->t_unit_res; 856 spin_unlock(&log->l_write_head.lock);
795 trace_xfs_log_regrant_write_wake_up(log, tic);
796 wake_up(&tic->t_wait);
797 }
798 spin_unlock(&log->l_grant_write_lock);
799 } 857 }
800 858
801 if (!list_empty_careful(&log->l_reserveq)) { 859 if (!list_empty_careful(&log->l_reserve_head.waiters)) {
802#ifdef DEBUG 860 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
803 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 861
804 panic("Recovery problem"); 862 spin_lock(&log->l_reserve_head.lock);
805#endif 863 free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
806 spin_lock(&log->l_grant_reserve_lock); 864 xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes);
807 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); 865 spin_unlock(&log->l_reserve_head.lock);
808 list_for_each_entry(tic, &log->l_reserveq, t_queue) {
809 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
810 need_bytes = tic->t_unit_res*tic->t_cnt;
811 else
812 need_bytes = tic->t_unit_res;
813 if (free_bytes < need_bytes && tail_lsn != 1)
814 break;
815 tail_lsn = 0;
816 free_bytes -= need_bytes;
817 trace_xfs_log_grant_wake_up(log, tic);
818 wake_up(&tic->t_wait);
819 }
820 spin_unlock(&log->l_grant_reserve_lock);
821 } 866 }
822} 867}
823 868
@@ -867,21 +912,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
867 return needed; 912 return needed;
868} 913}
869 914
870/****************************************************************************** 915/*
871 *
872 * local routines
873 *
874 ******************************************************************************
875 */
876
877/* xfs_trans_tail_ail returns 0 when there is nothing in the list.
878 * The log manager must keep track of the last LR which was committed
879 * to disk. The lsn of this LR will become the new tail_lsn whenever
880 * xfs_trans_tail_ail returns 0. If we don't do this, we run into
881 * the situation where stuff could be written into the log but nothing
882 * was ever in the AIL when asked. Eventually, we panic since the
883 * tail hits the head.
884 *
885 * We may be holding the log iclog lock upon entering this routine. 916 * We may be holding the log iclog lock upon entering this routine.
886 */ 917 */
887xfs_lsn_t 918xfs_lsn_t
@@ -891,10 +922,17 @@ xlog_assign_tail_lsn(
891 xfs_lsn_t tail_lsn; 922 xfs_lsn_t tail_lsn;
892 struct log *log = mp->m_log; 923 struct log *log = mp->m_log;
893 924
925 /*
926 * To make sure we always have a valid LSN for the log tail, we keep
927 * track of the last LSN which was committed in log->l_last_sync_lsn,
928 * and use that when the AIL is empty and xfs_ail_min_lsn returns 0.
929 *
930 * If the AIL has been emptied we also need to wake any process
931 * waiting for this condition.
932 */
894 tail_lsn = xfs_ail_min_lsn(mp->m_ail); 933 tail_lsn = xfs_ail_min_lsn(mp->m_ail);
895 if (!tail_lsn) 934 if (!tail_lsn)
896 tail_lsn = atomic64_read(&log->l_last_sync_lsn); 935 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
897
898 atomic64_set(&log->l_tail_lsn, tail_lsn); 936 atomic64_set(&log->l_tail_lsn, tail_lsn);
899 return tail_lsn; 937 return tail_lsn;
900} 938}
@@ -1100,12 +1138,9 @@ xlog_alloc_log(xfs_mount_t *mp,
1100 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); 1138 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
1101 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); 1139 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
1102 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ 1140 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
1103 xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0); 1141
1104 xlog_assign_grant_head(&log->l_grant_write_head, 1, 0); 1142 xlog_grant_head_init(&log->l_reserve_head);
1105 INIT_LIST_HEAD(&log->l_reserveq); 1143 xlog_grant_head_init(&log->l_write_head);
1106 INIT_LIST_HEAD(&log->l_writeq);
1107 spin_lock_init(&log->l_grant_reserve_lock);
1108 spin_lock_init(&log->l_grant_write_lock);
1109 1144
1110 error = EFSCORRUPTED; 1145 error = EFSCORRUPTED;
1111 if (xfs_sb_version_hassector(&mp->m_sb)) { 1146 if (xfs_sb_version_hassector(&mp->m_sb)) {
@@ -1280,7 +1315,7 @@ xlog_grant_push_ail(
1280 1315
1281 ASSERT(BTOBB(need_bytes) < log->l_logBBsize); 1316 ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
1282 1317
1283 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); 1318 free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
1284 free_blocks = BTOBBT(free_bytes); 1319 free_blocks = BTOBBT(free_bytes);
1285 1320
1286 /* 1321 /*
@@ -1412,8 +1447,8 @@ xlog_sync(xlog_t *log,
1412 roundoff < BBTOB(1))); 1447 roundoff < BBTOB(1)));
1413 1448
1414 /* move grant heads by roundoff in sync */ 1449 /* move grant heads by roundoff in sync */
1415 xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff); 1450 xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
1416 xlog_grant_add_space(log, &log->l_grant_write_head, roundoff); 1451 xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
1417 1452
1418 /* put cycle number in every block */ 1453 /* put cycle number in every block */
1419 xlog_pack_data(log, iclog, roundoff); 1454 xlog_pack_data(log, iclog, roundoff);
@@ -2566,119 +2601,6 @@ restart:
2566 return 0; 2601 return 0;
2567} /* xlog_state_get_iclog_space */ 2602} /* xlog_state_get_iclog_space */
2568 2603
2569/*
2570 * Atomically get the log space required for a log ticket.
2571 *
2572 * Once a ticket gets put onto the reserveq, it will only return after the
2573 * needed reservation is satisfied.
2574 *
2575 * This function is structured so that it has a lock free fast path. This is
2576 * necessary because every new transaction reservation will come through this
2577 * path. Hence any lock will be globally hot if we take it unconditionally on
2578 * every pass.
2579 *
2580 * As tickets are only ever moved on and off the reserveq under the
2581 * l_grant_reserve_lock, we only need to take that lock if we are going to add
2582 * the ticket to the queue and sleep. We can avoid taking the lock if the ticket
2583 * was never added to the reserveq because the t_queue list head will be empty
2584 * and we hold the only reference to it so it can safely be checked unlocked.
2585 */
2586STATIC int
2587xlog_grant_log_space(
2588 struct log *log,
2589 struct xlog_ticket *tic)
2590{
2591 int free_bytes, need_bytes;
2592 int error = 0;
2593
2594 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
2595
2596 trace_xfs_log_grant_enter(log, tic);
2597
2598 /*
2599 * If there are other waiters on the queue then give them a chance at
2600 * logspace before us. Wake up the first waiters, if we do not wake
2601 * up all the waiters then go to sleep waiting for more free space,
2602 * otherwise try to get some space for this transaction.
2603 */
2604 need_bytes = tic->t_unit_res;
2605 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2606 need_bytes *= tic->t_ocnt;
2607 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
2608 if (!list_empty_careful(&log->l_reserveq)) {
2609 spin_lock(&log->l_grant_reserve_lock);
2610 if (!xlog_reserveq_wake(log, &free_bytes) ||
2611 free_bytes < need_bytes)
2612 error = xlog_reserveq_wait(log, tic, need_bytes);
2613 spin_unlock(&log->l_grant_reserve_lock);
2614 } else if (free_bytes < need_bytes) {
2615 spin_lock(&log->l_grant_reserve_lock);
2616 error = xlog_reserveq_wait(log, tic, need_bytes);
2617 spin_unlock(&log->l_grant_reserve_lock);
2618 }
2619 if (error)
2620 return error;
2621
2622 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
2623 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2624 trace_xfs_log_grant_exit(log, tic);
2625 xlog_verify_grant_tail(log);
2626 return 0;
2627}
2628
2629/*
2630 * Replenish the byte reservation required by moving the grant write head.
2631 *
2632 * Similar to xlog_grant_log_space, the function is structured to have a lock
2633 * free fast path.
2634 */
2635STATIC int
2636xlog_regrant_write_log_space(
2637 struct log *log,
2638 struct xlog_ticket *tic)
2639{
2640 int free_bytes, need_bytes;
2641 int error = 0;
2642
2643 tic->t_curr_res = tic->t_unit_res;
2644 xlog_tic_reset_res(tic);
2645
2646 if (tic->t_cnt > 0)
2647 return 0;
2648
2649 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
2650
2651 trace_xfs_log_regrant_write_enter(log, tic);
2652
2653 /*
2654 * If there are other waiters on the queue then give them a chance at
2655 * logspace before us. Wake up the first waiters, if we do not wake
2656 * up all the waiters then go to sleep waiting for more free space,
2657 * otherwise try to get some space for this transaction.
2658 */
2659 need_bytes = tic->t_unit_res;
2660 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2661 if (!list_empty_careful(&log->l_writeq)) {
2662 spin_lock(&log->l_grant_write_lock);
2663 if (!xlog_writeq_wake(log, &free_bytes) ||
2664 free_bytes < need_bytes)
2665 error = xlog_writeq_wait(log, tic, need_bytes);
2666 spin_unlock(&log->l_grant_write_lock);
2667 } else if (free_bytes < need_bytes) {
2668 spin_lock(&log->l_grant_write_lock);
2669 error = xlog_writeq_wait(log, tic, need_bytes);
2670 spin_unlock(&log->l_grant_write_lock);
2671 }
2672
2673 if (error)
2674 return error;
2675
2676 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2677 trace_xfs_log_regrant_write_exit(log, tic);
2678 xlog_verify_grant_tail(log);
2679 return 0;
2680}
2681
2682/* The first cnt-1 times through here we don't need to 2604/* The first cnt-1 times through here we don't need to
2683 * move the grant write head because the permanent 2605 * move the grant write head because the permanent
2684 * reservation has reserved cnt times the unit amount. 2606 * reservation has reserved cnt times the unit amount.
@@ -2695,9 +2617,9 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2695 if (ticket->t_cnt > 0) 2617 if (ticket->t_cnt > 0)
2696 ticket->t_cnt--; 2618 ticket->t_cnt--;
2697 2619
2698 xlog_grant_sub_space(log, &log->l_grant_reserve_head, 2620 xlog_grant_sub_space(log, &log->l_reserve_head.grant,
2699 ticket->t_curr_res); 2621 ticket->t_curr_res);
2700 xlog_grant_sub_space(log, &log->l_grant_write_head, 2622 xlog_grant_sub_space(log, &log->l_write_head.grant,
2701 ticket->t_curr_res); 2623 ticket->t_curr_res);
2702 ticket->t_curr_res = ticket->t_unit_res; 2624 ticket->t_curr_res = ticket->t_unit_res;
2703 xlog_tic_reset_res(ticket); 2625 xlog_tic_reset_res(ticket);
@@ -2708,7 +2630,7 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2708 if (ticket->t_cnt > 0) 2630 if (ticket->t_cnt > 0)
2709 return; 2631 return;
2710 2632
2711 xlog_grant_add_space(log, &log->l_grant_reserve_head, 2633 xlog_grant_add_space(log, &log->l_reserve_head.grant,
2712 ticket->t_unit_res); 2634 ticket->t_unit_res);
2713 2635
2714 trace_xfs_log_regrant_reserve_exit(log, ticket); 2636 trace_xfs_log_regrant_reserve_exit(log, ticket);
@@ -2754,14 +2676,13 @@ xlog_ungrant_log_space(xlog_t *log,
2754 bytes += ticket->t_unit_res*ticket->t_cnt; 2676 bytes += ticket->t_unit_res*ticket->t_cnt;
2755 } 2677 }
2756 2678
2757 xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes); 2679 xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
2758 xlog_grant_sub_space(log, &log->l_grant_write_head, bytes); 2680 xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
2759 2681
2760 trace_xfs_log_ungrant_exit(log, ticket); 2682 trace_xfs_log_ungrant_exit(log, ticket);
2761 2683
2762 xfs_log_move_tail(log->l_mp, 1); 2684 xfs_log_space_wake(log->l_mp);
2763} /* xlog_ungrant_log_space */ 2685}
2764
2765 2686
2766/* 2687/*
2767 * Flush iclog to disk if this is the last reference to the given iclog and 2688 * Flush iclog to disk if this is the last reference to the given iclog and
@@ -3219,7 +3140,7 @@ xlog_ticket_alloc(
3219 int unit_bytes, 3140 int unit_bytes,
3220 int cnt, 3141 int cnt,
3221 char client, 3142 char client,
3222 uint xflags, 3143 bool permanent,
3223 int alloc_flags) 3144 int alloc_flags)
3224{ 3145{
3225 struct xlog_ticket *tic; 3146 struct xlog_ticket *tic;
@@ -3313,6 +3234,7 @@ xlog_ticket_alloc(
3313 } 3234 }
3314 3235
3315 atomic_set(&tic->t_ref, 1); 3236 atomic_set(&tic->t_ref, 1);
3237 tic->t_task = current;
3316 INIT_LIST_HEAD(&tic->t_queue); 3238 INIT_LIST_HEAD(&tic->t_queue);
3317 tic->t_unit_res = unit_bytes; 3239 tic->t_unit_res = unit_bytes;
3318 tic->t_curr_res = unit_bytes; 3240 tic->t_curr_res = unit_bytes;
@@ -3322,9 +3244,8 @@ xlog_ticket_alloc(
3322 tic->t_clientid = client; 3244 tic->t_clientid = client;
3323 tic->t_flags = XLOG_TIC_INITED; 3245 tic->t_flags = XLOG_TIC_INITED;
3324 tic->t_trans_type = 0; 3246 tic->t_trans_type = 0;
3325 if (xflags & XFS_LOG_PERM_RESERV) 3247 if (permanent)
3326 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3248 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3327 init_waitqueue_head(&tic->t_wait);
3328 3249
3329 xlog_tic_reset_res(tic); 3250 xlog_tic_reset_res(tic);
3330 3251
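
The t_task field added above replaces the per-ticket wait queue that this patch removes: a waiter records its task, sleeps uninterruptibly, and is woken directly with wake_up_process(). A hedged sketch of the resulting wait/wake pair; space_left() stands in for xlog_space_left(), and the function names are illustrative, not necessarily the ones this patch introduces:

static void grant_head_wait(struct xlog_grant_head *head,
                            struct xlog_ticket *tic, int need_bytes)
{
        list_add_tail(&tic->t_queue, &head->waiters);
        do {
                __set_current_state(TASK_UNINTERRUPTIBLE);
                spin_unlock(&head->lock);
                schedule();             /* until a waker runs wake_up_process() */
                spin_lock(&head->lock);
        } while (space_left(head) < need_bytes);
        list_del_init(&tic->t_queue);
}

static void grant_head_wake(struct xlog_grant_head *head)
{
        struct xlog_ticket      *tic;

        list_for_each_entry(tic, &head->waiters, t_queue)
                wake_up_process(tic->t_task);   /* no per-ticket waitqueue */
}
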
@@ -3380,7 +3301,7 @@ xlog_verify_grant_tail(
3380 int tail_cycle, tail_blocks; 3301 int tail_cycle, tail_blocks;
3381 int cycle, space; 3302 int cycle, space;
3382 3303
3383 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space); 3304 xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space);
3384 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); 3305 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
3385 if (tail_cycle != cycle) { 3306 if (tail_cycle != cycle) {
3386 if (cycle - 1 != tail_cycle && 3307 if (cycle - 1 != tail_cycle &&
@@ -3582,7 +3503,6 @@ xfs_log_force_umount(
3582 struct xfs_mount *mp, 3503 struct xfs_mount *mp,
3583 int logerror) 3504 int logerror)
3584{ 3505{
3585 xlog_ticket_t *tic;
3586 xlog_t *log; 3506 xlog_t *log;
3587 int retval; 3507 int retval;
3588 3508
@@ -3650,15 +3570,8 @@ xfs_log_force_umount(
3650 * we don't enqueue anything once the SHUTDOWN flag is set, and this 3570 * we don't enqueue anything once the SHUTDOWN flag is set, and this
3651 * action is protected by the grant locks. 3571 * action is protected by the grant locks.
3652 */ 3572 */
3653 spin_lock(&log->l_grant_reserve_lock); 3573 xlog_grant_head_wake_all(&log->l_reserve_head);
3654 list_for_each_entry(tic, &log->l_reserveq, t_queue) 3574 xlog_grant_head_wake_all(&log->l_write_head);
3655 wake_up(&tic->t_wait);
3656 spin_unlock(&log->l_grant_reserve_lock);
3657
3658 spin_lock(&log->l_grant_write_lock);
3659 list_for_each_entry(tic, &log->l_writeq, t_queue)
3660 wake_up(&tic->t_wait);
3661 spin_unlock(&log->l_grant_write_lock);
3662 3575
3663 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3576 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3664 ASSERT(!logerror); 3577 ASSERT(!logerror);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 2aee3b22d29c..2c622bedb302 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -53,15 +53,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
53#define XFS_LOG_REL_PERM_RESERV 0x1 53#define XFS_LOG_REL_PERM_RESERV 0x1
54 54
55/* 55/*
56 * Flags to xfs_log_reserve()
57 *
58 * XFS_LOG_PERM_RESERV: Permanent reservation. When writes are
59 * performed against this type of reservation, the reservation
60 * is not decreased. Long running transactions should use this.
61 */
62#define XFS_LOG_PERM_RESERV 0x2
63
64/*
65 * Flags to xfs_log_force() 56 * Flags to xfs_log_force()
66 * 57 *
67 * XFS_LOG_SYNC: Synchronous force in-core log to disk 58 * XFS_LOG_SYNC: Synchronous force in-core log to disk
@@ -160,8 +151,8 @@ int xfs_log_mount(struct xfs_mount *mp,
160 xfs_daddr_t start_block, 151 xfs_daddr_t start_block,
161 int num_bblocks); 152 int num_bblocks);
162int xfs_log_mount_finish(struct xfs_mount *mp); 153int xfs_log_mount_finish(struct xfs_mount *mp);
163void xfs_log_move_tail(struct xfs_mount *mp, 154xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
164 xfs_lsn_t tail_lsn); 155void xfs_log_space_wake(struct xfs_mount *mp);
165int xfs_log_notify(struct xfs_mount *mp, 156int xfs_log_notify(struct xfs_mount *mp,
166 struct xlog_in_core *iclog, 157 struct xlog_in_core *iclog,
167 xfs_log_callback_t *callback_entry); 158 xfs_log_callback_t *callback_entry);
@@ -172,8 +163,9 @@ int xfs_log_reserve(struct xfs_mount *mp,
172 int count, 163 int count,
173 struct xlog_ticket **ticket, 164 struct xlog_ticket **ticket,
174 __uint8_t clientid, 165 __uint8_t clientid,
175 uint flags, 166 bool permanent,
176 uint t_type); 167 uint t_type);
168int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
177int xfs_log_unmount_write(struct xfs_mount *mp); 169int xfs_log_unmount_write(struct xfs_mount *mp);
178void xfs_log_unmount(struct xfs_mount *mp); 170void xfs_log_unmount(struct xfs_mount *mp);
179int xfs_log_force_umount(struct xfs_mount *mp, int logerror); 171int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
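
With the flag argument gone, callers pass a plain bool for permanent reservations and use the new xfs_log_regrant() to refill an existing ticket when rolling a transaction. A sketch of plausible caller usage; the unit size, count, and zero t_type are arbitrary placeholders, not values taken from this patch:

int
example_start_logged_op(
        struct xfs_mount        *mp,
        struct xlog_ticket      **ticp)
{
        /* First call: take a permanent reservation good for two uses. */
        if (!*ticp)
                return xfs_log_reserve(mp, 65536, 2, ticp, XFS_TRANSACTION,
                                       true, 0);
        /* Rolling transaction: top the existing ticket back up. */
        return xfs_log_regrant(mp, *ticp);
}
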
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 2d3b6a498d63..2152900b79d4 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -239,8 +239,8 @@ typedef struct xlog_res {
239} xlog_res_t; 239} xlog_res_t;
240 240
241typedef struct xlog_ticket { 241typedef struct xlog_ticket {
242 wait_queue_head_t t_wait; /* ticket wait queue */
243 struct list_head t_queue; /* reserve/write queue */ 242 struct list_head t_queue; /* reserve/write queue */
243 struct task_struct *t_task; /* task that owns this ticket */
244 xlog_tid_t t_tid; /* transaction identifier : 4 */ 244 xlog_tid_t t_tid; /* transaction identifier : 4 */
245 atomic_t t_ref; /* ticket reference count : 4 */ 245 atomic_t t_ref; /* ticket reference count : 4 */
246 int t_curr_res; /* current reservation in bytes : 4 */ 246 int t_curr_res; /* current reservation in bytes : 4 */
@@ -470,6 +470,16 @@ struct xfs_cil {
470#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4)) 470#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4))
471 471
472/* 472/*
473 * ticket grant locks, queues and accounting have their own cachelines
474 * as these are quite hot and can be operated on concurrently.
475 */
476struct xlog_grant_head {
477 spinlock_t lock ____cacheline_aligned_in_smp;
478 struct list_head waiters;
479 atomic64_t grant;
480};
481
482/*
473 * The reservation head lsn is not made up of a cycle number and block number. 483 * The reservation head lsn is not made up of a cycle number and block number.
474 * Instead, it uses a cycle number and byte number. Logs don't expect to 484 * Instead, it uses a cycle number and byte number. Logs don't expect to
475 * overflow 31 bits worth of byte offset, so using a byte number will mean 485 * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -520,17 +530,8 @@ typedef struct log {
520 /* lsn of 1st LR with unflushed * buffers */ 530 /* lsn of 1st LR with unflushed * buffers */
521 atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; 531 atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
522 532
523 /* 533 struct xlog_grant_head l_reserve_head;
524 * ticket grant locks, queues and accounting have their own cachelines 534 struct xlog_grant_head l_write_head;
525 * as these are quite hot and can be operated on concurrently.
526 */
527 spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp;
528 struct list_head l_reserveq;
529 atomic64_t l_grant_reserve_head;
530
531 spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp;
532 struct list_head l_writeq;
533 atomic64_t l_grant_write_head;
534 535
535 /* The following field are used for debugging; need to hold icloglock */ 536 /* The following field are used for debugging; need to hold icloglock */
536#ifdef DEBUG 537#ifdef DEBUG
@@ -545,14 +546,13 @@ typedef struct log {
545#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 546#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
546 547
547/* common routines */ 548/* common routines */
548extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
549extern int xlog_recover(xlog_t *log); 549extern int xlog_recover(xlog_t *log);
550extern int xlog_recover_finish(xlog_t *log); 550extern int xlog_recover_finish(xlog_t *log);
551extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 551extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
552 552
553extern kmem_zone_t *xfs_log_ticket_zone; 553extern kmem_zone_t *xfs_log_ticket_zone;
554struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes, 554struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
555 int count, char client, uint xflags, 555 int count, char client, bool permanent,
556 int alloc_flags); 556 int alloc_flags);
557 557
558 558
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 0ed9ee77937c..8ecad5bad66c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -965,9 +965,9 @@ xlog_find_tail(
965 log->l_curr_cycle++; 965 log->l_curr_cycle++;
966 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); 966 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
967 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); 967 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
968 xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle, 968 xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
969 BBTOB(log->l_curr_block)); 969 BBTOB(log->l_curr_block));
970 xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle, 970 xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
971 BBTOB(log->l_curr_block)); 971 BBTOB(log->l_curr_block));
972 972
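
Both grant heads assigned here are single atomic64 values that pack the cycle number into the high 32 bits and the byte offset into the low 32 bits, which is what lets xlog_assign_grant_head() and xlog_crack_grant_head() update and read them without taking a lock. A standalone model of the packing (not the kernel helpers themselves):

#include <stdint.h>
#include <stdio.h>

static uint64_t assign_grant_head(int cycle, int bytes)
{
        return ((uint64_t)cycle << 32) | (uint32_t)bytes;
}

static void crack_grant_head(uint64_t head, int *cycle, int *bytes)
{
        *cycle = (int)(head >> 32);
        *bytes = (int)(head & 0xffffffffu);
}

int main(void)
{
        int cycle, bytes;

        crack_grant_head(assign_grant_head(7, 4096), &cycle, &bytes);
        printf("cycle %d, byte offset %d\n", cycle, bytes);     /* 7, 4096 */
        return 0;
}
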
973 /* 973 /*
@@ -3161,37 +3161,26 @@ xlog_recover_process_iunlinks(
3161 */ 3161 */
3162 continue; 3162 continue;
3163 } 3163 }
3164 /*
3165 * Unlock the buffer so that it can be acquired in the normal
3166 * course of the transaction to truncate and free each inode.
3167 * Because we are not racing with anyone else here for the AGI
3168 * buffer, we don't even need to hold it locked to read the
3169 * initial unlinked bucket entries out of the buffer. We keep a
3170 * buffer reference, though, so that it stays pinned in memory
3171 * while we need it.
3172 */
3164 agi = XFS_BUF_TO_AGI(agibp); 3173 agi = XFS_BUF_TO_AGI(agibp);
3174 xfs_buf_unlock(agibp);
3165 3175
3166 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { 3176 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3167 agino = be32_to_cpu(agi->agi_unlinked[bucket]); 3177 agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3168 while (agino != NULLAGINO) { 3178 while (agino != NULLAGINO) {
3169 /*
3170 * Release the agi buffer so that it can
3171 * be acquired in the normal course of the
3172 * transaction to truncate and free the inode.
3173 */
3174 xfs_buf_relse(agibp);
3175
3176 agino = xlog_recover_process_one_iunlink(mp, 3179 agino = xlog_recover_process_one_iunlink(mp,
3177 agno, agino, bucket); 3180 agno, agino, bucket);
3178
3179 /*
3180 * Reacquire the agi buffer and continue around
3181 * the loop. This should never fail as we know
3182 * the buffer was good earlier on.
3183 */
3184 error = xfs_read_agi(mp, NULL, agno, &agibp);
3185 ASSERT(error == 0);
3186 agi = XFS_BUF_TO_AGI(agibp);
3187 } 3181 }
3188 } 3182 }
3189 3183 xfs_buf_rele(agibp);
3190 /*
3191 * Release the buffer for the current agi so we can
3192 * go on to the next one.
3193 */
3194 xfs_buf_relse(agibp);
3195 } 3184 }
3196 3185
3197 mp->m_dmevmask = mp_dmevmask; 3186 mp->m_dmevmask = mp_dmevmask;
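
The restructured loop above relies on the distinction between a buffer's lock and its reference: xfs_buf_unlock() releases only the lock, while the held reference keeps the AGI buffer pinned until the final xfs_buf_rele(). In outline, the new flow per AG is:

error = xfs_read_agi(mp, NULL, agno, &agibp);   /* locked + referenced */
if (error)
        continue;
agi = XFS_BUF_TO_AGI(agibp);
xfs_buf_unlock(agibp);  /* drop only the lock; the reference pins it */

/* ... walk agi->agi_unlinked[] and run transactions that relock it ... */

xfs_buf_rele(agibp);    /* drop the reference once the walk is done */

This removes the relse/re-read cycle per inode that the deleted comments describe.
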
@@ -3695,7 +3684,7 @@ xlog_do_recover(
3695 3684
3696 /* Convert superblock from on-disk format */ 3685 /* Convert superblock from on-disk format */
3697 sbp = &log->l_mp->m_sb; 3686 sbp = &log->l_mp->m_sb;
3698 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); 3687 xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp));
3699 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); 3688 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3700 ASSERT(xfs_sb_good_version(sbp)); 3689 ASSERT(xfs_sb_good_version(sbp));
3701 xfs_buf_relse(bp); 3690 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d06afbc3540d..1ffead4b2296 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -158,7 +158,7 @@ xfs_uuid_mount(
158 158
159 out_duplicate: 159 out_duplicate:
160 mutex_unlock(&xfs_uuid_table_mutex); 160 mutex_unlock(&xfs_uuid_table_mutex);
161 xfs_warn(mp, "Filesystem has duplicate UUID - can't mount"); 161 xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
162 return XFS_ERROR(EINVAL); 162 return XFS_ERROR(EINVAL);
163} 163}
164 164
@@ -553,9 +553,11 @@ out_unwind:
553 553
554void 554void
555xfs_sb_from_disk( 555xfs_sb_from_disk(
556 xfs_sb_t *to, 556 struct xfs_mount *mp,
557 xfs_dsb_t *from) 557 xfs_dsb_t *from)
558{ 558{
559 struct xfs_sb *to = &mp->m_sb;
560
559 to->sb_magicnum = be32_to_cpu(from->sb_magicnum); 561 to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
560 to->sb_blocksize = be32_to_cpu(from->sb_blocksize); 562 to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
561 to->sb_dblocks = be64_to_cpu(from->sb_dblocks); 563 to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
@@ -693,7 +695,7 @@ reread:
693 * Initialize the mount structure from the superblock. 695 * Initialize the mount structure from the superblock.
694 * But first do some basic consistency checking. 696 * But first do some basic consistency checking.
695 */ 697 */
696 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 698 xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp));
697 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); 699 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
698 if (error) { 700 if (error) {
699 if (loud) 701 if (loud)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 19f69e232509..9eba73887829 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -211,6 +211,9 @@ typedef struct xfs_mount {
211 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 211 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
212 int64_t m_low_space[XFS_LOWSP_MAX]; 212 int64_t m_low_space[XFS_LOWSP_MAX];
213 /* low free space thresholds */ 213 /* low free space thresholds */
214
215 struct workqueue_struct *m_data_workqueue;
216 struct workqueue_struct *m_unwritten_workqueue;
214} xfs_mount_t; 217} xfs_mount_t;
215 218
216/* 219/*
@@ -395,7 +398,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
395extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 398extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
396extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, 399extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
397 xfs_agnumber_t *); 400 xfs_agnumber_t *);
398extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); 401extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *);
399extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); 402extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
400 403
401#endif /* __XFS_MOUNT_H__ */ 404#endif /* __XFS_MOUNT_H__ */
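
The two workqueues added to struct xfs_mount above arrive in this hunk without their setup code; allocation presumably follows the conventional alloc_workqueue() pattern, roughly as sketched below. The function name, queue name strings, and flags are assumptions, not taken from this diff:

static int
xfs_init_mount_workqueues(
        struct xfs_mount        *mp)
{
        mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
                        WQ_MEM_RECLAIM, 0, mp->m_fsname);
        if (!mp->m_data_workqueue)
                return -ENOMEM;

        mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
                        WQ_MEM_RECLAIM, 0, mp->m_fsname);
        if (!mp->m_unwritten_workqueue) {
                destroy_workqueue(mp->m_data_workqueue);
                return -ENOMEM;
        }
        return 0;
}

Teardown would mirror this with destroy_workqueue() on both queues at unmount.
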
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index c436def733bf..55c6afedc879 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -48,194 +48,189 @@
48 * quota functionality, including maintaining the freelist and hash 48 * quota functionality, including maintaining the freelist and hash
49 * tables of dquots. 49 * tables of dquots.
50 */ 50 */
51struct mutex xfs_Gqm_lock;
52struct xfs_qm *xfs_Gqm;
53
54kmem_zone_t *qm_dqzone;
55kmem_zone_t *qm_dqtrxzone;
56
57STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
58STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
59
60STATIC int xfs_qm_init_quotainos(xfs_mount_t *); 51STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
61STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 52STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
62STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *); 53STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *);
63 54
64static struct shrinker xfs_qm_shaker = {
65 .shrink = xfs_qm_shake,
66 .seeks = DEFAULT_SEEKS,
67};
68
69/* 55/*
70 * Initialize the XQM structure. 56 * We use the batch lookup interface to iterate over the dquots as it
71 * Note that there is not one quota manager per file system. 57 * currently is the only interface into the radix tree code that allows
58 * fuzzy lookups instead of exact matches. Holding the lock over multiple
59 * operations is fine as all callers run either during mount/umount
60 * or quotaoff.
72 */ 61 */
73STATIC struct xfs_qm * 62#define XFS_DQ_LOOKUP_BATCH 32
74xfs_Gqm_init(void) 63
64STATIC int
65xfs_qm_dquot_walk(
66 struct xfs_mount *mp,
67 int type,
68 int (*execute)(struct xfs_dquot *dqp))
75{ 69{
76 xfs_dqhash_t *udqhash, *gdqhash; 70 struct xfs_quotainfo *qi = mp->m_quotainfo;
77 xfs_qm_t *xqm; 71 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
78 size_t hsize; 72 uint32_t next_index;
79 uint i; 73 int last_error = 0;
74 int skipped;
75 int nr_found;
76
77restart:
78 skipped = 0;
79 next_index = 0;
80 nr_found = 0;
81
82 while (1) {
83 struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH];
84 int error = 0;
85 int i;
86
87 mutex_lock(&qi->qi_tree_lock);
88 nr_found = radix_tree_gang_lookup(tree, (void **)batch,
89 next_index, XFS_DQ_LOOKUP_BATCH);
90 if (!nr_found) {
91 mutex_unlock(&qi->qi_tree_lock);
92 break;
93 }
80 94
81 /* 95 for (i = 0; i < nr_found; i++) {
82 * Initialize the dquot hash tables. 96 struct xfs_dquot *dqp = batch[i];
83 */
84 udqhash = kmem_zalloc_greedy(&hsize,
85 XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t),
86 XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t));
87 if (!udqhash)
88 goto out;
89 97
90 gdqhash = kmem_zalloc_large(hsize); 98 next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
91 if (!gdqhash)
92 goto out_free_udqhash;
93 99
94 hsize /= sizeof(xfs_dqhash_t); 100 error = execute(batch[i]);
101 if (error == EAGAIN) {
102 skipped++;
103 continue;
104 }
105 if (error && last_error != EFSCORRUPTED)
106 last_error = error;
107 }
95 108
96 xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP); 109 mutex_unlock(&qi->qi_tree_lock);
97 xqm->qm_dqhashmask = hsize - 1;
98 xqm->qm_usr_dqhtable = udqhash;
99 xqm->qm_grp_dqhtable = gdqhash;
100 ASSERT(xqm->qm_usr_dqhtable != NULL);
101 ASSERT(xqm->qm_grp_dqhtable != NULL);
102 110
103 for (i = 0; i < hsize; i++) { 111 /* bail out if the filesystem is corrupted. */
104 xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i); 112 if (last_error == EFSCORRUPTED) {
105 xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i); 113 skipped = 0;
114 break;
115 }
106 } 116 }
107 117
108 /* 118 if (skipped) {
109 * Freelist of all dquots of all file systems 119 delay(1);
110 */ 120 goto restart;
111 INIT_LIST_HEAD(&xqm->qm_dqfrlist); 121 }
112 xqm->qm_dqfrlist_cnt = 0;
113 mutex_init(&xqm->qm_dqfrlist_lock);
114
115 /*
116 * dquot zone. we register our own low-memory callback.
117 */
118 if (!qm_dqzone) {
119 xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t),
120 "xfs_dquots");
121 qm_dqzone = xqm->qm_dqzone;
122 } else
123 xqm->qm_dqzone = qm_dqzone;
124
125 register_shrinker(&xfs_qm_shaker);
126
127 /*
128 * The t_dqinfo portion of transactions.
129 */
130 if (!qm_dqtrxzone) {
131 xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t),
132 "xfs_dqtrx");
133 qm_dqtrxzone = xqm->qm_dqtrxzone;
134 } else
135 xqm->qm_dqtrxzone = qm_dqtrxzone;
136
137 atomic_set(&xqm->qm_totaldquots, 0);
138 xqm->qm_nrefs = 0;
139 return xqm;
140 122
141 out_free_udqhash: 123 return last_error;
142 kmem_free_large(udqhash);
143 out:
144 return NULL;
145} 124}
146 125
126
147/* 127/*
148 * Destroy the global quota manager when its reference count goes to zero. 128 * Purge a dquot from all tracking data structures and free it.
149 */ 129 */
150STATIC void 130STATIC int
151xfs_qm_destroy( 131xfs_qm_dqpurge(
152 struct xfs_qm *xqm) 132 struct xfs_dquot *dqp)
153{ 133{
154 int hsize, i; 134 struct xfs_mount *mp = dqp->q_mount;
135 struct xfs_quotainfo *qi = mp->m_quotainfo;
136 struct xfs_dquot *gdqp = NULL;
155 137
156 ASSERT(xqm != NULL); 138 xfs_dqlock(dqp);
157 ASSERT(xqm->qm_nrefs == 0); 139 if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
140 xfs_dqunlock(dqp);
141 return EAGAIN;
142 }
158 143
159 unregister_shrinker(&xfs_qm_shaker); 144 /*
145 * If this quota has a group hint attached, prepare for releasing it
146 * now.
147 */
148 gdqp = dqp->q_gdquot;
149 if (gdqp) {
150 xfs_dqlock(gdqp);
151 dqp->q_gdquot = NULL;
152 }
160 153
161 mutex_lock(&xqm->qm_dqfrlist_lock); 154 dqp->dq_flags |= XFS_DQ_FREEING;
162 ASSERT(list_empty(&xqm->qm_dqfrlist));
163 mutex_unlock(&xqm->qm_dqfrlist_lock);
164 155
165 hsize = xqm->qm_dqhashmask + 1; 156 /*
166 for (i = 0; i < hsize; i++) { 157 * If we're turning off quotas, we have to make sure that, for
167 xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); 158 * example, we don't delete quota disk blocks while dquots are
168 xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); 159 * in the process of getting written to those disk blocks.
160 * This dquot might well be on the AIL, and we can't leave it there
161 * if we're turning off quotas. Basically, we need this flush
162 * lock, and are willing to block on it.
163 */
164 if (!xfs_dqflock_nowait(dqp)) {
165 /*
166 * Block on the flush lock after nudging the dquot buffer,
167 * if it is incore.
168 */
169 xfs_dqflock_pushbuf_wait(dqp);
169 } 170 }
170 kmem_free_large(xqm->qm_usr_dqhtable);
171 kmem_free_large(xqm->qm_grp_dqhtable);
172 xqm->qm_usr_dqhtable = NULL;
173 xqm->qm_grp_dqhtable = NULL;
174 xqm->qm_dqhashmask = 0;
175 171
176 kmem_free(xqm);
177}
178
179/*
180 * Called at mount time to let XQM know that another file system is
181 * starting quotas. This isn't crucial information as the individual mount
182 * structures are pretty independent, but it helps the XQM keep a
183 * global view of what's going on.
184 */
185/* ARGSUSED */
186STATIC int
187xfs_qm_hold_quotafs_ref(
188 struct xfs_mount *mp)
189{
190 /* 172 /*
191 * Need to lock the xfs_Gqm structure for things like this. For example, 173 * If we are turning this type of quotas off, we don't care
192 * the structure could disappear between the entry to this routine and 174 * about the dirty metadata sitting in this dquot. OTOH, if
193 * a HOLD operation if not locked. 175 * we're unmounting, we do care, so we flush it and wait.
194 */ 176 */
195 mutex_lock(&xfs_Gqm_lock); 177 if (XFS_DQ_IS_DIRTY(dqp)) {
178 int error;
196 179
197 if (!xfs_Gqm) { 180 /*
198 xfs_Gqm = xfs_Gqm_init(); 181 * We don't care about getting disk errors here. We need
199 if (!xfs_Gqm) { 182 * to purge this dquot anyway, so we go ahead regardless.
200 mutex_unlock(&xfs_Gqm_lock); 183 */
201 return ENOMEM; 184 error = xfs_qm_dqflush(dqp, SYNC_WAIT);
202 } 185 if (error)
186 xfs_warn(mp, "%s: dquot %p flush failed",
187 __func__, dqp);
188 xfs_dqflock(dqp);
203 } 189 }
204 190
191 ASSERT(atomic_read(&dqp->q_pincount) == 0);
192 ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
193 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
194
195 xfs_dqfunlock(dqp);
196 xfs_dqunlock(dqp);
197
198 radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
199 be32_to_cpu(dqp->q_core.d_id));
200 qi->qi_dquots--;
201
205 /* 202 /*
206 * We can keep a list of all filesystems with quotas mounted for 203 * We move dquots to the freelist as soon as their reference count
207 * debugging and statistical purposes, but ... 204 * hits zero, so it really should be on the freelist here.
208 * Just take a reference and get out.
209 */ 205 */
210 xfs_Gqm->qm_nrefs++; 206 mutex_lock(&qi->qi_lru_lock);
211 mutex_unlock(&xfs_Gqm_lock); 207 ASSERT(!list_empty(&dqp->q_lru));
208 list_del_init(&dqp->q_lru);
209 qi->qi_lru_count--;
210 XFS_STATS_DEC(xs_qm_dquot_unused);
211 mutex_unlock(&qi->qi_lru_lock);
212 212
213 xfs_qm_dqdestroy(dqp);
214
215 if (gdqp)
216 xfs_qm_dqput(gdqp);
213 return 0; 217 return 0;
214} 218}
215 219
216
217/* 220/*
218 * Release the reference that a filesystem took at mount time, 221 * Purge the dquot cache.
219 * so that we know when we need to destroy the entire quota manager.
220 */ 222 */
221/* ARGSUSED */ 223void
222STATIC void 224xfs_qm_dqpurge_all(
223xfs_qm_rele_quotafs_ref( 225 struct xfs_mount *mp,
224 struct xfs_mount *mp) 226 uint flags)
225{ 227{
226 ASSERT(xfs_Gqm); 228 if (flags & XFS_QMOPT_UQUOTA)
227 ASSERT(xfs_Gqm->qm_nrefs > 0); 229 xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge);
228 230 if (flags & XFS_QMOPT_GQUOTA)
229 /* 231 xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge);
230 * Destroy the entire XQM. If somebody mounts with quotaon, this'll 232 if (flags & XFS_QMOPT_PQUOTA)
231 * be restarted. 233 xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge);
232 */
233 mutex_lock(&xfs_Gqm_lock);
234 if (--xfs_Gqm->qm_nrefs == 0) {
235 xfs_qm_destroy(xfs_Gqm);
236 xfs_Gqm = NULL;
237 }
238 mutex_unlock(&xfs_Gqm_lock);
239} 234}
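
xfs_qm_dquot_walk() above iterates with radix_tree_gang_lookup() in batches, dropping qi_tree_lock between batches; the next_index bookkeeping is what makes the walk resumable once the lock has been released. A standalone model of that pattern, using a sorted array in place of the radix tree:

#include <stdio.h>

#define BATCH 4

static unsigned ids[] = { 3, 7, 8, 15, 16, 23, 42, 99 };
#define NIDS (sizeof(ids) / sizeof(ids[0]))

/* Return up to max entries with id >= first, like a gang lookup. */
static int gang_lookup(unsigned first, unsigned *batch, int max)
{
        int n = 0;

        for (unsigned i = 0; i < NIDS && n < max; i++)
                if (ids[i] >= first)
                        batch[n++] = ids[i];
        return n;
}

int main(void)
{
        unsigned next_index = 0;

        for (;;) {
                unsigned batch[BATCH];
                int nr = gang_lookup(next_index, batch, BATCH);

                if (!nr)
                        break;          /* tree exhausted */
                for (int i = 0; i < nr; i++) {
                        next_index = batch[i] + 1;      /* resume point */
                        printf("visit dquot id %u\n", batch[i]);
                }
        }
        return 0;
}
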
240 235
241/* 236/*
@@ -376,175 +371,6 @@ xfs_qm_unmount_quotas(
376 } 371 }
377} 372}
378 373
379/*
380 * Flush all dquots of the given file system to disk. The dquots are
381 * _not_ purged from memory here, just their data written to disk.
382 */
383STATIC int
384xfs_qm_dqflush_all(
385 struct xfs_mount *mp)
386{
387 struct xfs_quotainfo *q = mp->m_quotainfo;
388 int recl;
389 struct xfs_dquot *dqp;
390 int error;
391
392 if (!q)
393 return 0;
394again:
395 mutex_lock(&q->qi_dqlist_lock);
396 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
397 xfs_dqlock(dqp);
398 if ((dqp->dq_flags & XFS_DQ_FREEING) ||
399 !XFS_DQ_IS_DIRTY(dqp)) {
400 xfs_dqunlock(dqp);
401 continue;
402 }
403
404 /* XXX a sentinel would be better */
405 recl = q->qi_dqreclaims;
406 if (!xfs_dqflock_nowait(dqp)) {
407 /*
408 * If we can't grab the flush lock then check
409 * to see if the dquot has been flushed out as a
410 * delayed write. If so, grab its buffer and send it
411 * out immediately. We'll be able to acquire
412 * the flush lock when the I/O completes.
413 */
414 xfs_dqflock_pushbuf_wait(dqp);
415 }
416 /*
417 * Let go of the mplist lock. We don't want to hold it
418 * across a disk write.
419 */
420 mutex_unlock(&q->qi_dqlist_lock);
421 error = xfs_qm_dqflush(dqp, 0);
422 xfs_dqunlock(dqp);
423 if (error)
424 return error;
425
426 mutex_lock(&q->qi_dqlist_lock);
427 if (recl != q->qi_dqreclaims) {
428 mutex_unlock(&q->qi_dqlist_lock);
429 /* XXX restart limit */
430 goto again;
431 }
432 }
433
434 mutex_unlock(&q->qi_dqlist_lock);
435 /* return ! busy */
436 return 0;
437}
438
439/*
440 * Release the group dquot pointers the user dquots may be
441 * carrying around as a hint. mplist is locked on entry and exit.
442 */
443STATIC void
444xfs_qm_detach_gdquots(
445 struct xfs_mount *mp)
446{
447 struct xfs_quotainfo *q = mp->m_quotainfo;
448 struct xfs_dquot *dqp, *gdqp;
449
450 again:
451 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
452 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
453 xfs_dqlock(dqp);
454 if (dqp->dq_flags & XFS_DQ_FREEING) {
455 xfs_dqunlock(dqp);
456 mutex_unlock(&q->qi_dqlist_lock);
457 delay(1);
458 mutex_lock(&q->qi_dqlist_lock);
459 goto again;
460 }
461
462 gdqp = dqp->q_gdquot;
463 if (gdqp)
464 dqp->q_gdquot = NULL;
465 xfs_dqunlock(dqp);
466
467 if (gdqp)
468 xfs_qm_dqrele(gdqp);
469 }
470}
471
472/*
473 * Go through all the incore dquots of this file system and take them
474 * off the mplist and hashlist, if the dquot type matches the dqtype
475 * parameter. This is used when turning off quota accounting for
476 * users and/or groups, as well as when the filesystem is unmounting.
477 */
478STATIC int
479xfs_qm_dqpurge_int(
480 struct xfs_mount *mp,
481 uint flags)
482{
483 struct xfs_quotainfo *q = mp->m_quotainfo;
484 struct xfs_dquot *dqp, *n;
485 uint dqtype;
486 int nmisses = 0;
487 LIST_HEAD(dispose_list);
488
489 if (!q)
490 return 0;
491
492 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
493 dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
494 dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
495
496 mutex_lock(&q->qi_dqlist_lock);
497
498 /*
499 * In the first pass through all incore dquots of this filesystem,
500 * we release the group dquot pointers the user dquots may be
501 * carrying around as a hint. We need to do this irrespective of
502 * what's being turned off.
503 */
504 xfs_qm_detach_gdquots(mp);
505
506 /*
507 * Try to get rid of all of the unwanted dquots.
508 */
509 list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
510 xfs_dqlock(dqp);
511 if ((dqp->dq_flags & dqtype) != 0 &&
512 !(dqp->dq_flags & XFS_DQ_FREEING)) {
513 if (dqp->q_nrefs == 0) {
514 dqp->dq_flags |= XFS_DQ_FREEING;
515 list_move_tail(&dqp->q_mplist, &dispose_list);
516 } else
517 nmisses++;
518 }
519 xfs_dqunlock(dqp);
520 }
521 mutex_unlock(&q->qi_dqlist_lock);
522
523 list_for_each_entry_safe(dqp, n, &dispose_list, q_mplist)
524 xfs_qm_dqpurge(dqp);
525
526 return nmisses;
527}
528
529int
530xfs_qm_dqpurge_all(
531 xfs_mount_t *mp,
532 uint flags)
533{
534 int ndquots;
535
536 /*
537 * Purge the dquot cache.
538 * None of the dquots should really be busy at this point.
539 */
540 if (mp->m_quotainfo) {
541 while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) {
542 delay(ndquots * 10);
543 }
544 }
545 return 0;
546}
547
548STATIC int 374STATIC int
549xfs_qm_dqattach_one( 375xfs_qm_dqattach_one(
550 xfs_inode_t *ip, 376 xfs_inode_t *ip,
@@ -783,14 +609,6 @@ xfs_qm_dqdetach(
783} 609}
784 610
785/* 611/*
786 * The hash chains and the mplist use the same xfs_dqhash structure as
787 * their list head, but we can take the mplist qh_lock and one of the
788 * hash qh_locks at the same time without any problem as they aren't
789 * related.
790 */
791static struct lock_class_key xfs_quota_mplist_class;
792
793/*
794 * This initializes all the quota information that's kept in the 612 * This initializes all the quota information that's kept in the
795 * mount structure 613 * mount structure
796 */ 614 */
@@ -804,13 +622,6 @@ xfs_qm_init_quotainfo(
804 622
805 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 623 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
806 624
807 /*
808 * Tell XQM that we exist as soon as possible.
809 */
810 if ((error = xfs_qm_hold_quotafs_ref(mp))) {
811 return error;
812 }
813
814 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); 625 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
815 626
816 /* 627 /*
@@ -823,11 +634,13 @@ xfs_qm_init_quotainfo(
823 return error; 634 return error;
824 } 635 }
825 636
826 INIT_LIST_HEAD(&qinf->qi_dqlist); 637 INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
827 mutex_init(&qinf->qi_dqlist_lock); 638 INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
828 lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class); 639 mutex_init(&qinf->qi_tree_lock);
829 640
830 qinf->qi_dqreclaims = 0; 641 INIT_LIST_HEAD(&qinf->qi_lru_list);
642 qinf->qi_lru_count = 0;
643 mutex_init(&qinf->qi_lru_lock);
831 644
832 /* mutex used to serialize quotaoffs */ 645 /* mutex used to serialize quotaoffs */
833 mutex_init(&qinf->qi_quotaofflock); 646 mutex_init(&qinf->qi_quotaofflock);
@@ -894,6 +707,9 @@ xfs_qm_init_quotainfo(
894 qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; 707 qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
895 } 708 }
896 709
710 qinf->qi_shrinker.shrink = xfs_qm_shake;
711 qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
712 register_shrinker(&qinf->qi_shrinker);
897 return 0; 713 return 0;
898} 714}
899 715
@@ -911,17 +727,8 @@ xfs_qm_destroy_quotainfo(
911 727
912 qi = mp->m_quotainfo; 728 qi = mp->m_quotainfo;
913 ASSERT(qi != NULL); 729 ASSERT(qi != NULL);
914 ASSERT(xfs_Gqm != NULL);
915
916 /*
917 * Release the reference that XQM kept, so that we know
918 * when the XQM structure should be freed. We cannot assume
919 * that xfs_Gqm is non-null after this point.
920 */
921 xfs_qm_rele_quotafs_ref(mp);
922 730
923 ASSERT(list_empty(&qi->qi_dqlist)); 731 unregister_shrinker(&qi->qi_shrinker);
924 mutex_destroy(&qi->qi_dqlist_lock);
925 732
926 if (qi->qi_uquotaip) { 733 if (qi->qi_uquotaip) {
927 IRELE(qi->qi_uquotaip); 734 IRELE(qi->qi_uquotaip);
@@ -936,30 +743,6 @@ xfs_qm_destroy_quotainfo(
936 mp->m_quotainfo = NULL; 743 mp->m_quotainfo = NULL;
937} 744}
938 745
939
940
941/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */
942
943/* ARGSUSED */
944STATIC void
945xfs_qm_list_init(
946 xfs_dqlist_t *list,
947 char *str,
948 int n)
949{
950 mutex_init(&list->qh_lock);
951 INIT_LIST_HEAD(&list->qh_list);
952 list->qh_version = 0;
953 list->qh_nelems = 0;
954}
955
956STATIC void
957xfs_qm_list_destroy(
958 xfs_dqlist_t *list)
959{
960 mutex_destroy(&(list->qh_lock));
961}
962
963/* 746/*
964 * Create an inode and return with a reference already taken, but unlocked 747 * Create an inode and return with a reference already taken, but unlocked
965 * This is how we create quota inodes 748 * This is how we create quota inodes
@@ -1397,6 +1180,28 @@ error0:
1397 return error; 1180 return error;
1398} 1181}
1399 1182
1183STATIC int
1184xfs_qm_flush_one(
1185 struct xfs_dquot *dqp)
1186{
1187 int error = 0;
1188
1189 xfs_dqlock(dqp);
1190 if (dqp->dq_flags & XFS_DQ_FREEING)
1191 goto out_unlock;
1192 if (!XFS_DQ_IS_DIRTY(dqp))
1193 goto out_unlock;
1194
1195 if (!xfs_dqflock_nowait(dqp))
1196 xfs_dqflock_pushbuf_wait(dqp);
1197
1198 error = xfs_qm_dqflush(dqp, 0);
1199
1200out_unlock:
1201 xfs_dqunlock(dqp);
1202 return error;
1203}
1204
1400/* 1205/*
1401 * Walk thru all the filesystem inodes and construct a consistent view 1206 * Walk thru all the filesystem inodes and construct a consistent view
1402 * of the disk quota world. If the quotacheck fails, disable quotas. 1207 * of the disk quota world. If the quotacheck fails, disable quotas.
@@ -1405,7 +1210,7 @@ int
1405xfs_qm_quotacheck( 1210xfs_qm_quotacheck(
1406 xfs_mount_t *mp) 1211 xfs_mount_t *mp)
1407{ 1212{
1408 int done, count, error; 1213 int done, count, error, error2;
1409 xfs_ino_t lastino; 1214 xfs_ino_t lastino;
1410 size_t structsz; 1215 size_t structsz;
1411 xfs_inode_t *uip, *gip; 1216 xfs_inode_t *uip, *gip;
@@ -1419,12 +1224,6 @@ xfs_qm_quotacheck(
1419 ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); 1224 ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
1420 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1225 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1421 1226
1422 /*
1423 * There should be no cached dquots. The (simplistic) quotacheck
1424 * algorithm doesn't like that.
1425 */
1426 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
1427
1428 xfs_notice(mp, "Quotacheck needed: Please wait."); 1227 xfs_notice(mp, "Quotacheck needed: Please wait.");
1429 1228
1430 /* 1229 /*
@@ -1463,12 +1262,21 @@ xfs_qm_quotacheck(
1463 } while (!done); 1262 } while (!done);
1464 1263
1465 /* 1264 /*
1466 * We've made all the changes that we need to make incore. 1265 * We've made all the changes that we need to make incore. Flush them
1467 * Flush them down to disk buffers if everything was updated 1266 * down to disk buffers if everything was updated successfully.
1468 * successfully.
1469 */ 1267 */
1470 if (!error) 1268 if (XFS_IS_UQUOTA_ON(mp))
1471 error = xfs_qm_dqflush_all(mp); 1269 error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one);
1270 if (XFS_IS_GQUOTA_ON(mp)) {
1271 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one);
1272 if (!error)
1273 error = error2;
1274 }
1275 if (XFS_IS_PQUOTA_ON(mp)) {
1276 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one);
1277 if (!error)
1278 error = error2;
1279 }
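
The error/error2 dance above is the usual first-error-wins idiom: every quota type is still flushed, but the error eventually reported is the first one hit. As a standalone fragment (positive errno-style values, as in this code):

#include <stdio.h>

static int flush_type(int rc) { return rc; }    /* stand-in for a walk */

int main(void)
{
        int error, error2;

        error = flush_type(0);          /* user quotas: ok */
        error2 = flush_type(5);         /* group quotas: fails */
        if (!error)
                error = error2;
        error2 = flush_type(7);         /* project quotas: also fails */
        if (!error)
                error = error2;         /* not taken: first failure kept */
        printf("reported error: %d\n", error);  /* prints 5 */
        return 0;
}
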
1472 1280
1473 /* 1281 /*
1474 * We can get this error if we couldn't do a dquot allocation inside 1282 * We can get this error if we couldn't do a dquot allocation inside
@@ -1496,7 +1304,7 @@ xfs_qm_quotacheck(
1496 * quotachecked status, since we won't be doing accounting for 1304 * quotachecked status, since we won't be doing accounting for
1497 * that type anymore. 1305 * that type anymore.
1498 */ 1306 */
1499 mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); 1307 mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD;
1500 mp->m_qflags |= flags; 1308 mp->m_qflags |= flags;
1501 1309
1502 error_return: 1310 error_return:
@@ -1508,7 +1316,6 @@ xfs_qm_quotacheck(
1508 * We must turn off quotas. 1316 * We must turn off quotas.
1509 */ 1317 */
1510 ASSERT(mp->m_quotainfo != NULL); 1318 ASSERT(mp->m_quotainfo != NULL);
1511 ASSERT(xfs_Gqm != NULL);
1512 xfs_qm_destroy_quotainfo(mp); 1319 xfs_qm_destroy_quotainfo(mp);
1513 if (xfs_mount_reset_sbqflags(mp)) { 1320 if (xfs_mount_reset_sbqflags(mp)) {
1514 xfs_warn(mp, 1321 xfs_warn(mp,
@@ -1604,16 +1411,12 @@ xfs_qm_dqfree_one(
1604 struct xfs_mount *mp = dqp->q_mount; 1411 struct xfs_mount *mp = dqp->q_mount;
1605 struct xfs_quotainfo *qi = mp->m_quotainfo; 1412 struct xfs_quotainfo *qi = mp->m_quotainfo;
1606 1413
1607 mutex_lock(&dqp->q_hash->qh_lock); 1414 mutex_lock(&qi->qi_tree_lock);
1608 list_del_init(&dqp->q_hashlist); 1415 radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
1609 dqp->q_hash->qh_version++; 1416 be32_to_cpu(dqp->q_core.d_id));
1610 mutex_unlock(&dqp->q_hash->qh_lock);
1611 1417
1612 mutex_lock(&qi->qi_dqlist_lock);
1613 list_del_init(&dqp->q_mplist);
1614 qi->qi_dquots--; 1418 qi->qi_dquots--;
1615 qi->qi_dqreclaims++; 1419 mutex_unlock(&qi->qi_tree_lock);
1616 mutex_unlock(&qi->qi_dqlist_lock);
1617 1420
1618 xfs_qm_dqdestroy(dqp); 1421 xfs_qm_dqdestroy(dqp);
1619} 1422}
@@ -1624,6 +1427,7 @@ xfs_qm_dqreclaim_one(
1624 struct list_head *dispose_list) 1427 struct list_head *dispose_list)
1625{ 1428{
1626 struct xfs_mount *mp = dqp->q_mount; 1429 struct xfs_mount *mp = dqp->q_mount;
1430 struct xfs_quotainfo *qi = mp->m_quotainfo;
1627 int error; 1431 int error;
1628 1432
1629 if (!xfs_dqlock_nowait(dqp)) 1433 if (!xfs_dqlock_nowait(dqp))
@@ -1637,16 +1441,14 @@ xfs_qm_dqreclaim_one(
1637 xfs_dqunlock(dqp); 1441 xfs_dqunlock(dqp);
1638 1442
1639 trace_xfs_dqreclaim_want(dqp); 1443 trace_xfs_dqreclaim_want(dqp);
1640 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1444 XFS_STATS_INC(xs_qm_dqwants);
1641 1445
1642 list_del_init(&dqp->q_freelist); 1446 list_del_init(&dqp->q_lru);
1643 xfs_Gqm->qm_dqfrlist_cnt--; 1447 qi->qi_lru_count--;
1448 XFS_STATS_DEC(xs_qm_dquot_unused);
1644 return; 1449 return;
1645 } 1450 }
1646 1451
1647 ASSERT(dqp->q_hash);
1648 ASSERT(!list_empty(&dqp->q_mplist));
1649
1650 /* 1452 /*
1651 * Try to grab the flush lock. If this dquot is in the process of 1453 * Try to grab the flush lock. If this dquot is in the process of
1652 * getting flushed to disk, we don't want to reclaim it. 1454 * getting flushed to disk, we don't want to reclaim it.
@@ -1688,11 +1490,12 @@ xfs_qm_dqreclaim_one(
1688 xfs_dqunlock(dqp); 1490 xfs_dqunlock(dqp);
1689 1491
1690 ASSERT(dqp->q_nrefs == 0); 1492 ASSERT(dqp->q_nrefs == 0);
1691 list_move_tail(&dqp->q_freelist, dispose_list); 1493 list_move_tail(&dqp->q_lru, dispose_list);
1692 xfs_Gqm->qm_dqfrlist_cnt--; 1494 qi->qi_lru_count--;
1495 XFS_STATS_DEC(xs_qm_dquot_unused);
1693 1496
1694 trace_xfs_dqreclaim_done(dqp); 1497 trace_xfs_dqreclaim_done(dqp);
1695 XQM_STATS_INC(xqmstats.xs_qm_dqreclaims); 1498 XFS_STATS_INC(xs_qm_dqreclaims);
1696 return; 1499 return;
1697 1500
1698out_busy: 1501out_busy:
@@ -1701,10 +1504,10 @@ out_busy:
1701 /* 1504 /*
1702 * Move the dquot to the tail of the list so that we don't spin on it. 1505 * Move the dquot to the tail of the list so that we don't spin on it.
1703 */ 1506 */
1704 list_move_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); 1507 list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
1705 1508
1706 trace_xfs_dqreclaim_busy(dqp); 1509 trace_xfs_dqreclaim_busy(dqp);
1707 XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses); 1510 XFS_STATS_INC(xs_qm_dqreclaim_misses);
1708} 1511}
1709 1512
1710STATIC int 1513STATIC int
@@ -1712,6 +1515,8 @@ xfs_qm_shake(
1712 struct shrinker *shrink, 1515 struct shrinker *shrink,
1713 struct shrink_control *sc) 1516 struct shrink_control *sc)
1714{ 1517{
1518 struct xfs_quotainfo *qi =
1519 container_of(shrink, struct xfs_quotainfo, qi_shrinker);
1715 int nr_to_scan = sc->nr_to_scan; 1520 int nr_to_scan = sc->nr_to_scan;
1716 LIST_HEAD(dispose_list); 1521 LIST_HEAD(dispose_list);
1717 struct xfs_dquot *dqp; 1522 struct xfs_dquot *dqp;
@@ -1721,24 +1526,23 @@ xfs_qm_shake(
1721 if (!nr_to_scan) 1526 if (!nr_to_scan)
1722 goto out; 1527 goto out;
1723 1528
1724 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); 1529 mutex_lock(&qi->qi_lru_lock);
1725 while (!list_empty(&xfs_Gqm->qm_dqfrlist)) { 1530 while (!list_empty(&qi->qi_lru_list)) {
1726 if (nr_to_scan-- <= 0) 1531 if (nr_to_scan-- <= 0)
1727 break; 1532 break;
1728 dqp = list_first_entry(&xfs_Gqm->qm_dqfrlist, struct xfs_dquot, 1533 dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
1729 q_freelist); 1534 q_lru);
1730 xfs_qm_dqreclaim_one(dqp, &dispose_list); 1535 xfs_qm_dqreclaim_one(dqp, &dispose_list);
1731 } 1536 }
1732 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); 1537 mutex_unlock(&qi->qi_lru_lock);
1733 1538
1734 while (!list_empty(&dispose_list)) { 1539 while (!list_empty(&dispose_list)) {
1735 dqp = list_first_entry(&dispose_list, struct xfs_dquot, 1540 dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
1736 q_freelist); 1541 list_del_init(&dqp->q_lru);
1737 list_del_init(&dqp->q_freelist);
1738 xfs_qm_dqfree_one(dqp); 1542 xfs_qm_dqfree_one(dqp);
1739 } 1543 }
1740out: 1544out:
1741 return (xfs_Gqm->qm_dqfrlist_cnt / 100) * sysctl_vfs_cache_pressure; 1545 return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure;
1742} 1546}
1743 1547
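
xfs_qm_shake() above follows the standard shrinker shape: candidates are moved from the shared LRU to a private dispose list while qi_lru_lock is held, and the potentially blocking frees happen only after the lock is dropped. A standalone model of the two phases (arrays stand in for the kernel list heads; nothing here is XFS API):

#include <stdio.h>

int main(void)
{
        int lru[] = { 11, 12, 13, 14, 15 };
        int lru_count = 5;
        int dispose[5], ndispose = 0;
        int nr_to_scan = 3;
        int i;

        /* phase 1: under qi_lru_lock in the real code */
        for (i = 0; i < lru_count && nr_to_scan--; i++)
                dispose[ndispose++] = lru[i];   /* list_move_tail() */
        lru_count -= ndispose;

        /* phase 2: lock dropped; the free path may block safely */
        for (i = 0; i < ndispose; i++)
                printf("freeing dquot %d\n", dispose[i]);
        printf("%d left on lru\n", lru_count);
        return 0;
}
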
1744/* 1548/*
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 9a9b997e1a0a..44b858b79d71 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -21,21 +21,10 @@
21#include "xfs_dquot_item.h" 21#include "xfs_dquot_item.h"
22#include "xfs_dquot.h" 22#include "xfs_dquot.h"
23#include "xfs_quota_priv.h" 23#include "xfs_quota_priv.h"
24#include "xfs_qm_stats.h"
25 24
26struct xfs_qm;
27struct xfs_inode; 25struct xfs_inode;
28 26
29extern struct mutex xfs_Gqm_lock; 27extern struct kmem_zone *xfs_qm_dqtrxzone;
30extern struct xfs_qm *xfs_Gqm;
31extern kmem_zone_t *qm_dqzone;
32extern kmem_zone_t *qm_dqtrxzone;
33
34/*
35 * Dquot hashtable constants/threshold values.
36 */
37#define XFS_QM_HASHSIZE_LOW (PAGE_SIZE / sizeof(xfs_dqhash_t))
38#define XFS_QM_HASHSIZE_HIGH ((PAGE_SIZE * 4) / sizeof(xfs_dqhash_t))
39 28
40/* 29/*
41 * This defines the unit of allocation of dquots. 30 * This defines the unit of allocation of dquots.
@@ -48,36 +37,20 @@ extern kmem_zone_t *qm_dqtrxzone;
48 */ 37 */
49#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 38#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
50 39
51typedef xfs_dqhash_t xfs_dqlist_t;
52
53/*
54 * Quota Manager (global) structure. Lives only in core.
55 */
56typedef struct xfs_qm {
57 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */
58 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */
59 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */
60 struct list_head qm_dqfrlist; /* freelist of dquots */
61 struct mutex qm_dqfrlist_lock;
62 int qm_dqfrlist_cnt;
63 atomic_t qm_totaldquots; /* total incore dquots */
64 uint qm_nrefs; /* file systems with quota on */
65 kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */
66 kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */
67} xfs_qm_t;
68
69/* 40/*
70 * Various quota information for individual filesystems. 41 * Various quota information for individual filesystems.
71 * The mount structure keeps a pointer to this. 42 * The mount structure keeps a pointer to this.
72 */ 43 */
73typedef struct xfs_quotainfo { 44typedef struct xfs_quotainfo {
45 struct radix_tree_root qi_uquota_tree;
46 struct radix_tree_root qi_gquota_tree;
47 struct mutex qi_tree_lock;
74 xfs_inode_t *qi_uquotaip; /* user quota inode */ 48 xfs_inode_t *qi_uquotaip; /* user quota inode */
75 xfs_inode_t *qi_gquotaip; /* group quota inode */ 49 xfs_inode_t *qi_gquotaip; /* group quota inode */
76 struct list_head qi_dqlist; /* all dquots in filesys */ 50 struct list_head qi_lru_list;
77 struct mutex qi_dqlist_lock; 51 struct mutex qi_lru_lock;
52 int qi_lru_count;
78 int qi_dquots; 53 int qi_dquots;
79 int qi_dqreclaims; /* a change here indicates
80 a removal in the dqlist */
81 time_t qi_btimelimit; /* limit for blks timer */ 54 time_t qi_btimelimit; /* limit for blks timer */
82 time_t qi_itimelimit; /* limit for inodes timer */ 55 time_t qi_itimelimit; /* limit for inodes timer */
83 time_t qi_rtbtimelimit;/* limit for rt blks timer */ 56 time_t qi_rtbtimelimit;/* limit for rt blks timer */
@@ -93,8 +66,14 @@ typedef struct xfs_quotainfo {
93 xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */ 66 xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */
94 xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */ 67 xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */
95 xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */ 68 xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */
69 struct shrinker qi_shrinker;
96} xfs_quotainfo_t; 70} xfs_quotainfo_t;
97 71
72#define XFS_DQUOT_TREE(qi, type) \
73 ((type & XFS_DQ_USER) ? \
74 &((qi)->qi_uquota_tree) : \
75 &((qi)->qi_gquota_tree))
76
98 77
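
XFS_DQUOT_TREE() routes user dquots to qi_uquota_tree and everything else, including project dquots, to qi_gquota_tree. A hedged sketch of a lookup helper built on it; the real code also takes a dquot reference before dropping qi_tree_lock, which is omitted here, and the function name is illustrative:

static struct xfs_dquot *
example_dq_lookup(
        struct xfs_quotainfo    *qi,
        xfs_dqid_t              id,
        uint                    type)
{
        struct radix_tree_root  *tree = XFS_DQUOT_TREE(qi, type);
        struct xfs_dquot        *dqp;

        mutex_lock(&qi->qi_tree_lock);
        dqp = radix_tree_lookup(tree, id);      /* NULL when not cached */
        mutex_unlock(&qi->qi_tree_lock);
        return dqp;
}
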
99extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); 78extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
100extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, 79extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
@@ -130,7 +109,7 @@ extern int xfs_qm_quotacheck(xfs_mount_t *);
130extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); 109extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
131 110
132/* dquot stuff */ 111/* dquot stuff */
133extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint); 112extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint);
134extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); 113extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
135 114
136/* quota ops */ 115/* quota ops */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index a0a829addca9..e6986b5d80d8 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -40,28 +40,28 @@
40STATIC void 40STATIC void
41xfs_fill_statvfs_from_dquot( 41xfs_fill_statvfs_from_dquot(
42 struct kstatfs *statp, 42 struct kstatfs *statp,
43 xfs_disk_dquot_t *dp) 43 struct xfs_dquot *dqp)
44{ 44{
45 __uint64_t limit; 45 __uint64_t limit;
46 46
47 limit = dp->d_blk_softlimit ? 47 limit = dqp->q_core.d_blk_softlimit ?
48 be64_to_cpu(dp->d_blk_softlimit) : 48 be64_to_cpu(dqp->q_core.d_blk_softlimit) :
49 be64_to_cpu(dp->d_blk_hardlimit); 49 be64_to_cpu(dqp->q_core.d_blk_hardlimit);
50 if (limit && statp->f_blocks > limit) { 50 if (limit && statp->f_blocks > limit) {
51 statp->f_blocks = limit; 51 statp->f_blocks = limit;
52 statp->f_bfree = statp->f_bavail = 52 statp->f_bfree = statp->f_bavail =
53 (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? 53 (statp->f_blocks > dqp->q_res_bcount) ?
54 (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; 54 (statp->f_blocks - dqp->q_res_bcount) : 0;
55 } 55 }
56 56
57 limit = dp->d_ino_softlimit ? 57 limit = dqp->q_core.d_ino_softlimit ?
58 be64_to_cpu(dp->d_ino_softlimit) : 58 be64_to_cpu(dqp->q_core.d_ino_softlimit) :
59 be64_to_cpu(dp->d_ino_hardlimit); 59 be64_to_cpu(dqp->q_core.d_ino_hardlimit);
60 if (limit && statp->f_files > limit) { 60 if (limit && statp->f_files > limit) {
61 statp->f_files = limit; 61 statp->f_files = limit;
62 statp->f_ffree = 62 statp->f_ffree =
63 (statp->f_files > be64_to_cpu(dp->d_icount)) ? 63 (statp->f_files > dqp->q_res_icount) ?
64 (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0; 64 (statp->f_ffree - dqp->q_res_icount) : 0;
65 } 65 }
66} 66}
67 67
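
xfs_fill_statvfs_from_dquot() now clamps the statfs numbers against the in-core reservation counters (q_res_bcount, q_res_icount) rather than the on-disk d_bcount/d_icount, so blocks that are only reserved, not yet written, are already visible to df. A standalone model of the block clamping, with made-up numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t f_blocks = 1000000, f_bfree;
        uint64_t soft = 5000, hard = 8000;      /* project quota limits */
        uint64_t res_bcount = 3200;             /* blocks reserved so far */
        uint64_t limit = soft ? soft : hard;    /* soft limit wins if set */

        if (limit && f_blocks > limit) {
                f_blocks = limit;
                f_bfree = f_blocks > res_bcount ? f_blocks - res_bcount : 0;
                printf("df sees: %llu blocks, %llu free\n",
                       (unsigned long long)f_blocks,
                       (unsigned long long)f_bfree);
        }
        return 0;
}
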
@@ -82,7 +82,7 @@ xfs_qm_statvfs(
82 xfs_dquot_t *dqp; 82 xfs_dquot_t *dqp;
83 83
84 if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) { 84 if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
85 xfs_fill_statvfs_from_dquot(statp, &dqp->q_core); 85 xfs_fill_statvfs_from_dquot(statp, dqp);
86 xfs_qm_dqput(dqp); 86 xfs_qm_dqput(dqp);
87 } 87 }
88} 88}
@@ -156,21 +156,3 @@ xfs_qm_newmount(
156 156
157 return 0; 157 return 0;
158} 158}
159
160void __init
161xfs_qm_init(void)
162{
163 printk(KERN_INFO "SGI XFS Quota Management subsystem\n");
164 mutex_init(&xfs_Gqm_lock);
165 xfs_qm_init_procfs();
166}
167
168void __exit
169xfs_qm_exit(void)
170{
171 xfs_qm_cleanup_procfs();
172 if (qm_dqzone)
173 kmem_zone_destroy(qm_dqzone);
174 if (qm_dqtrxzone)
175 kmem_zone_destroy(qm_dqtrxzone);
176}
diff --git a/fs/xfs/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c
deleted file mode 100644
index 5729ba570877..000000000000
--- a/fs/xfs/xfs_qm_stats.c
+++ /dev/null
@@ -1,105 +0,0 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_bit.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_alloc.h"
27#include "xfs_quota.h"
28#include "xfs_mount.h"
29#include "xfs_bmap_btree.h"
30#include "xfs_inode.h"
31#include "xfs_itable.h"
32#include "xfs_bmap.h"
33#include "xfs_rtalloc.h"
34#include "xfs_error.h"
35#include "xfs_attr.h"
36#include "xfs_buf_item.h"
37#include "xfs_qm.h"
38
39struct xqmstats xqmstats;
40
41static int xqm_proc_show(struct seq_file *m, void *v)
42{
43 /* maximum; incore; ratio free to inuse; freelist */
44 seq_printf(m, "%d\t%d\t%d\t%u\n",
45 0,
46 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
47 0,
48 xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
49 return 0;
50}
51
52static int xqm_proc_open(struct inode *inode, struct file *file)
53{
54 return single_open(file, xqm_proc_show, NULL);
55}
56
57static const struct file_operations xqm_proc_fops = {
58 .owner = THIS_MODULE,
59 .open = xqm_proc_open,
60 .read = seq_read,
61 .llseek = seq_lseek,
62 .release = single_release,
63};
64
65static int xqmstat_proc_show(struct seq_file *m, void *v)
66{
67 /* quota performance statistics */
68 seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
69 xqmstats.xs_qm_dqreclaims,
70 xqmstats.xs_qm_dqreclaim_misses,
71 xqmstats.xs_qm_dquot_dups,
72 xqmstats.xs_qm_dqcachemisses,
73 xqmstats.xs_qm_dqcachehits,
74 xqmstats.xs_qm_dqwants,
75 xqmstats.xs_qm_dqshake_reclaims,
76 xqmstats.xs_qm_dqinact_reclaims);
77 return 0;
78}
79
80static int xqmstat_proc_open(struct inode *inode, struct file *file)
81{
82 return single_open(file, xqmstat_proc_show, NULL);
83}
84
85static const struct file_operations xqmstat_proc_fops = {
86 .owner = THIS_MODULE,
87 .open = xqmstat_proc_open,
88 .read = seq_read,
89 .llseek = seq_lseek,
90 .release = single_release,
91};
92
93void
94xfs_qm_init_procfs(void)
95{
96 proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
97 proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
98}
99
100void
101xfs_qm_cleanup_procfs(void)
102{
103 remove_proc_entry("fs/xfs/xqm", NULL);
104 remove_proc_entry("fs/xfs/xqmstat", NULL);
105}
diff --git a/fs/xfs/xfs_qm_stats.h b/fs/xfs/xfs_qm_stats.h
deleted file mode 100644
index 5b964fc0dc09..000000000000
--- a/fs/xfs/xfs_qm_stats.h
+++ /dev/null
@@ -1,53 +0,0 @@
1/*
2 * Copyright (c) 2002 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_QM_STATS_H__
19#define __XFS_QM_STATS_H__
20
21#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
22
23/*
24 * XQM global statistics
25 */
26struct xqmstats {
27 __uint32_t xs_qm_dqreclaims;
28 __uint32_t xs_qm_dqreclaim_misses;
29 __uint32_t xs_qm_dquot_dups;
30 __uint32_t xs_qm_dqcachemisses;
31 __uint32_t xs_qm_dqcachehits;
32 __uint32_t xs_qm_dqwants;
33 __uint32_t xs_qm_dqshake_reclaims;
34 __uint32_t xs_qm_dqinact_reclaims;
35};
36
37extern struct xqmstats xqmstats;
38
39# define XQM_STATS_INC(count) ( (count)++ )
40
41extern void xfs_qm_init_procfs(void);
42extern void xfs_qm_cleanup_procfs(void);
43
44#else
45
46# define XQM_STATS_INC(count) do { } while (0)
47
48static inline void xfs_qm_init_procfs(void) { };
49static inline void xfs_qm_cleanup_procfs(void) { };
50
51#endif
52
53#endif /* __XFS_QM_STATS_H__ */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 711a86e39ff0..c4f396e437a8 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -47,9 +47,6 @@ STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
47 uint); 47 uint);
48STATIC uint xfs_qm_export_flags(uint); 48STATIC uint xfs_qm_export_flags(uint);
49STATIC uint xfs_qm_export_qtype_flags(uint); 49STATIC uint xfs_qm_export_qtype_flags(uint);
50STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
51 fs_disk_quota_t *);
52
53 50
54/* 51/*
55 * Turn off quota accounting and/or enforcement for all udquots and/or 52 * Turn off quota accounting and/or enforcement for all udquots and/or
@@ -69,7 +66,6 @@ xfs_qm_scall_quotaoff(
69 int error; 66 int error;
70 uint inactivate_flags; 67 uint inactivate_flags;
71 xfs_qoff_logitem_t *qoffstart; 68 xfs_qoff_logitem_t *qoffstart;
72 int nculprits;
73 69
74 /* 70 /*
75 * No file system can have quotas enabled on disk but not in core. 71 * No file system can have quotas enabled on disk but not in core.
@@ -175,18 +171,13 @@ xfs_qm_scall_quotaoff(
175 * This isn't protected by a particular lock directly, because we 171 * This isn't protected by a particular lock directly, because we
176 * don't want to take a mrlock every time we depend on quotas being on. 172 * don't want to take a mrlock every time we depend on quotas being on.
177 */ 173 */
178 mp->m_qflags &= ~(flags); 174 mp->m_qflags &= ~flags;
179 175
180 /* 176 /*
181 * Go through all the dquots of this file system and purge them, 177 * Go through all the dquots of this file system and purge them,
182 * according to what was turned off. We may not be able to get rid 178 * according to what was turned off.
183 * of all dquots, because dquots can have temporary references that
184 * are not attached to inodes. eg. xfs_setattr, xfs_create.
185 * So, if we couldn't purge all the dquots from the filesystem,
186 * we can't get rid of the incore data structures.
187 */ 179 */
188 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype))) 180 xfs_qm_dqpurge_all(mp, dqtype);
189 delay(10 * nculprits);
190 181
191 /* 182 /*
192 * Transactions that had started before ACTIVE state bit was cleared 183 * Transactions that had started before ACTIVE state bit was cleared
@@ -635,42 +626,6 @@ xfs_qm_scall_setqlim(
635 return error; 626 return error;
636} 627}
637 628
638int
639xfs_qm_scall_getquota(
640 xfs_mount_t *mp,
641 xfs_dqid_t id,
642 uint type,
643 fs_disk_quota_t *out)
644{
645 xfs_dquot_t *dqp;
646 int error;
647
648 /*
649 * Try to get the dquot. We don't want it allocated on disk, so
650 * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
651 * exist, we'll get ENOENT back.
652 */
653 if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) {
654 return (error);
655 }
656
657 /*
658 * If everything's NULL, this dquot doesn't quite exist as far as
659 * our utility programs are concerned.
660 */
661 if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
662 xfs_qm_dqput(dqp);
663 return XFS_ERROR(ENOENT);
664 }
665 /*
666 * Convert the disk dquot to the exportable format
667 */
668 xfs_qm_export_dquot(mp, &dqp->q_core, out);
669 xfs_qm_dqput(dqp);
670 return (error ? XFS_ERROR(EFAULT) : 0);
671}
672
673
674STATIC int 629STATIC int
675xfs_qm_log_quotaoff_end( 630xfs_qm_log_quotaoff_end(
676 xfs_mount_t *mp, 631 xfs_mount_t *mp,
@@ -759,50 +714,66 @@ error0:
759} 714}
760 715
761 716
762/* 717int
763 * Translate an internal style on-disk-dquot to the exportable format. 718xfs_qm_scall_getquota(
764 * The main differences are that the counters/limits are all in Basic 719 struct xfs_mount *mp,
765 * Blocks (BBs) instead of the internal FSBs, and all on-disk data has 720 xfs_dqid_t id,
766 * to be converted to the native endianness. 721 uint type,
767 */
768STATIC void
769xfs_qm_export_dquot(
770 xfs_mount_t *mp,
771 xfs_disk_dquot_t *src,
772 struct fs_disk_quota *dst) 722 struct fs_disk_quota *dst)
773{ 723{
724 struct xfs_dquot *dqp;
725 int error;
726
727 /*
728 * Try to get the dquot. We don't want it allocated on disk, so
729 * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
730 * exist, we'll get ENOENT back.
731 */
732 error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp);
733 if (error)
734 return error;
735
736 /*
737 * If everything's NULL, this dquot doesn't quite exist as far as
738 * our utility programs are concerned.
739 */
740 if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
741 error = XFS_ERROR(ENOENT);
742 goto out_put;
743 }
744
774 memset(dst, 0, sizeof(*dst)); 745 memset(dst, 0, sizeof(*dst));
775 dst->d_version = FS_DQUOT_VERSION; /* different from src->d_version */ 746 dst->d_version = FS_DQUOT_VERSION;
776 dst->d_flags = xfs_qm_export_qtype_flags(src->d_flags); 747 dst->d_flags = xfs_qm_export_qtype_flags(dqp->q_core.d_flags);
777 dst->d_id = be32_to_cpu(src->d_id); 748 dst->d_id = be32_to_cpu(dqp->q_core.d_id);
778 dst->d_blk_hardlimit = 749 dst->d_blk_hardlimit =
779 XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_hardlimit)); 750 XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
780 dst->d_blk_softlimit = 751 dst->d_blk_softlimit =
781 XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_softlimit)); 752 XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
782 dst->d_ino_hardlimit = be64_to_cpu(src->d_ino_hardlimit); 753 dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
783 dst->d_ino_softlimit = be64_to_cpu(src->d_ino_softlimit); 754 dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
784 dst->d_bcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_bcount)); 755 dst->d_bcount = XFS_FSB_TO_BB(mp, dqp->q_res_bcount);
785 dst->d_icount = be64_to_cpu(src->d_icount); 756 dst->d_icount = dqp->q_res_icount;
786 dst->d_btimer = be32_to_cpu(src->d_btimer); 757 dst->d_btimer = be32_to_cpu(dqp->q_core.d_btimer);
787 dst->d_itimer = be32_to_cpu(src->d_itimer); 758 dst->d_itimer = be32_to_cpu(dqp->q_core.d_itimer);
788 dst->d_iwarns = be16_to_cpu(src->d_iwarns); 759 dst->d_iwarns = be16_to_cpu(dqp->q_core.d_iwarns);
789 dst->d_bwarns = be16_to_cpu(src->d_bwarns); 760 dst->d_bwarns = be16_to_cpu(dqp->q_core.d_bwarns);
790 dst->d_rtb_hardlimit = 761 dst->d_rtb_hardlimit =
791 XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_hardlimit)); 762 XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit));
792 dst->d_rtb_softlimit = 763 dst->d_rtb_softlimit =
793 XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_softlimit)); 764 XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit));
794 dst->d_rtbcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtbcount)); 765 dst->d_rtbcount = XFS_FSB_TO_BB(mp, dqp->q_res_rtbcount);
795 dst->d_rtbtimer = be32_to_cpu(src->d_rtbtimer); 766 dst->d_rtbtimer = be32_to_cpu(dqp->q_core.d_rtbtimer);
796 dst->d_rtbwarns = be16_to_cpu(src->d_rtbwarns); 767 dst->d_rtbwarns = be16_to_cpu(dqp->q_core.d_rtbwarns);
797 768
798 /* 769 /*
799 * Internally, we don't reset all the timers when quota enforcement 770 * Internally, we don't reset all the timers when quota enforcement
800 * gets turned off. No need to confuse the user level code, 771 * gets turned off. No need to confuse the user level code,
801 * so return zeroes in that case. 772 * so return zeroes in that case.
802 */ 773 */
803 if ((!XFS_IS_UQUOTA_ENFORCED(mp) && src->d_flags == XFS_DQ_USER) || 774 if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) ||
804 (!XFS_IS_OQUOTA_ENFORCED(mp) && 775 (!XFS_IS_OQUOTA_ENFORCED(mp) &&
805 (src->d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) { 776 (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
806 dst->d_btimer = 0; 777 dst->d_btimer = 0;
807 dst->d_itimer = 0; 778 dst->d_itimer = 0;
808 dst->d_rtbtimer = 0; 779 dst->d_rtbtimer = 0;
@@ -823,6 +794,9 @@ xfs_qm_export_dquot(
823 } 794 }
824 } 795 }
825#endif 796#endif
797out_put:
798 xfs_qm_dqput(dqp);
799 return error;
826} 800}
827 801
828STATIC uint 802STATIC uint
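The rewritten xfs_qm_scall_getquota() above folds the old export helper into the lookup path and funnels every failure through a single out_put label, so the dquot reference is dropped exactly once on all paths. A minimal userspace sketch of that shape; dq_get, dq_put, and the struct are invented stand-ins, not kernel APIs, and errors are positive errno values in the XFS style.

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct dquot { int refcount; int uninitialized; };

    /* invented lookup: returns a positive errno on failure */
    static int dq_get(int id, struct dquot **out)
    {
        struct dquot *d = calloc(1, sizeof(*d));

        if (!d)
            return ENOMEM;
        d->refcount = 1;
        d->uninitialized = (id == 0);
        *out = d;
        return 0;
    }

    static void dq_put(struct dquot *d)
    {
        if (--d->refcount == 0)
            free(d);
    }

    static int getquota(int id, int *out)
    {
        struct dquot *dqp;
        int error = dq_get(id, &dqp);

        if (error)
            return error;        /* nothing acquired yet */

        if (dqp->uninitialized) {
            error = ENOENT;      /* fail, but still drop the reference */
            goto out_put;
        }
        *out = dqp->refcount;    /* stands in for the export step */
    out_put:
        dq_put(dqp);             /* single release point for all paths */
        return error;
    }

    int main(void)
    {
        int v = 0;

        printf("id 1 -> error %d\n", getquota(1, &v));
        printf("id 0 -> error %d\n", getquota(0, &v));
        return 0;
    }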
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 8a0807e0f979..b50ec5b95d5a 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -174,6 +174,8 @@ typedef struct xfs_qoff_logformat {
174#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */ 174#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */
175#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */ 175#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */
176#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */ 176#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */
177#define XFS_ALL_QUOTA_ACTIVE \
178 (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)
177 179
178/* 180/*
179 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees 181 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
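XFS_ALL_QUOTA_ACTIVE bundles the three per-type ACTIVE bits so callers can clear them with one mask, which is exactly what the xfs_super.c mount-option hunk later in this patch does. A toy illustration of the masking; the flag values are copied from the header above, everything else is invented.

    #include <stdio.h>

    #define XFS_UQUOTA_ACTIVE 0x0100
    #define XFS_PQUOTA_ACTIVE 0x0200
    #define XFS_GQUOTA_ACTIVE 0x0400
    #define XFS_ALL_QUOTA_ACTIVE \
        (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)

    int main(void)
    {
        unsigned int qflags = XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | 0x0001;

        qflags &= ~XFS_ALL_QUOTA_ACTIVE;      /* clears all three at once */
        printf("qflags = 0x%04x\n", qflags);  /* only 0x0001 survives */
        return 0;
    }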
diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h
index 94a3d927d716..6d86219d93da 100644
--- a/fs/xfs/xfs_quota_priv.h
+++ b/fs/xfs/xfs_quota_priv.h
@@ -24,17 +24,6 @@
24 */ 24 */
25#define XFS_DQITER_MAP_SIZE 10 25#define XFS_DQITER_MAP_SIZE 10
26 26
27/*
28 * Hash into a bucket in the dquot hash table, based on <mp, id>.
29 */
30#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
31 (__psunsigned_t)(id)) & \
32 (xfs_Gqm->qm_dqhashmask - 1))
33#define XFS_DQ_HASH(mp, id, type) (type == XFS_DQ_USER ? \
34 (xfs_Gqm->qm_usr_dqhtable + \
35 XFS_DQ_HASHVAL(mp, id)) : \
36 (xfs_Gqm->qm_grp_dqhtable + \
37 XFS_DQ_HASHVAL(mp, id)))
38#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ 27#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
39 !dqp->q_core.d_blk_hardlimit && \ 28 !dqp->q_core.d_blk_hardlimit && \
40 !dqp->q_core.d_blk_softlimit && \ 29 !dqp->q_core.d_blk_softlimit && \
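The removed XFS_DQ_HASHVAL folded the mount pointer and the dquot id into a bucket index with a power-of-two mask; the global hash table it fed is what this series retires. For reference, a sketch of what the old macro computed, assuming a power-of-two table size; NBUCKETS and dq_hash are illustrative names.

    #include <stdint.h>
    #include <stdio.h>

    #define NBUCKETS 64u   /* must be a power of two for the mask to work */

    /* mount pointer plus id, masked down to a bucket, as the macro did */
    static unsigned int dq_hash(const void *mp, uint32_t id)
    {
        return (unsigned int)(((uintptr_t)mp + id) & (NBUCKETS - 1));
    }

    int main(void)
    {
        int mount_a, mount_b;   /* two distinct "mounts" */

        printf("a/id 7 -> bucket %u\n", dq_hash(&mount_a, 7));
        printf("b/id 7 -> bucket %u\n", dq_hash(&mount_b, 7));
        return 0;
    }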
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 87323f1ded64..ca4f31534a0a 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -183,6 +183,7 @@ error_cancel:
183 oblocks = map.br_startoff + map.br_blockcount; 183 oblocks = map.br_startoff + map.br_blockcount;
184 } 184 }
185 return 0; 185 return 0;
186
186error: 187error:
187 return error; 188 return error;
188} 189}
@@ -2139,11 +2140,9 @@ xfs_rtfree_extent(
2139 xfs_buf_t *sumbp; /* summary file block buffer */ 2140 xfs_buf_t *sumbp; /* summary file block buffer */
2140 2141
2141 mp = tp->t_mountp; 2142 mp = tp->t_mountp;
2142 /* 2143
2143 * Synchronize by locking the bitmap inode. 2144 ASSERT(mp->m_rbmip->i_itemp != NULL);
2144 */ 2145 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
2145 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2146 xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2147 2146
2148#if defined(__KERNEL__) && defined(DEBUG) 2147#if defined(__KERNEL__) && defined(DEBUG)
2149 /* 2148 /*
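The xfs_rtfree_extent() hunk above stops taking the bitmap inode lock itself and instead asserts that the caller already holds it, pushing the locking up one layer. A hedged sketch of that caller-holds-the-lock contract; the lock_held flag stands in for xfs_isilocked(), and all names are invented.

    #include <assert.h>
    #include <stdio.h>

    struct inode { int lock_held; };   /* stand-in for the real ilock */

    static void ilock(struct inode *ip)          { ip->lock_held = 1; }
    static void iunlock(struct inode *ip)        { ip->lock_held = 0; }
    static int  isilocked(const struct inode *ip){ return ip->lock_held; }

    /* the callee no longer locks; it asserts the caller's contract */
    static void rtfree_extent(struct inode *bitmap_ip)
    {
        assert(isilocked(bitmap_ip));   /* like ASSERT(xfs_isilocked(...)) */
        puts("freeing extent under the caller-held bitmap lock");
    }

    int main(void)
    {
        struct inode rbmip = { 0 };

        ilock(&rbmip);          /* transaction setup now does this */
        rtfree_extent(&rbmip);
        iunlock(&rbmip);
        return 0;
    }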
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index cb6ae715814a..f429d9d5d325 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -529,7 +529,6 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
529#define XFS_BB_TO_FSB(mp,bb) \ 529#define XFS_BB_TO_FSB(mp,bb) \
530 (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log) 530 (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
531#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log) 531#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log)
532#define XFS_BB_FSB_OFFSET(mp,bb) ((bb) & ((mp)->m_bsize - 1))
533 532
534/* 533/*
535 * File system block to byte conversions. 534 * File system block to byte conversions.
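Quota counters are exported in 512-byte basic blocks while XFS accounts internally in filesystem blocks, so the BB/FSB conversions are plain shifts by m_blkbb_log; the dropped XFS_BB_FSB_OFFSET was the now-unused companion mask. A self-contained sketch of the two directions, with an invented struct and field name.

    #include <stdint.h>
    #include <stdio.h>

    struct mount { int blkbb_log; };   /* log2(fs block size / 512) */

    static uint64_t fsb_to_bb(const struct mount *mp, uint64_t fsb)
    {
        return fsb << mp->blkbb_log;   /* fs blocks -> basic blocks */
    }

    static uint64_t bb_to_fsb(const struct mount *mp, uint64_t bb)
    {
        /* round up: a partial block still occupies a whole fs block */
        return (bb + (1ull << mp->blkbb_log) - 1) >> mp->blkbb_log;
    }

    int main(void)
    {
        struct mount mp = { 3 };   /* 4096-byte blocks: 4096 / 512 = 8 */

        printf("10 fsb -> %llu bb\n",  (unsigned long long)fsb_to_bb(&mp, 10));
        printf("9 bb   -> %llu fsb\n", (unsigned long long)bb_to_fsb(&mp, 9));
        return 0;
    }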
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 76fdc5861932..ce372b7d5644 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -20,9 +20,18 @@
20 20
21DEFINE_PER_CPU(struct xfsstats, xfsstats); 21DEFINE_PER_CPU(struct xfsstats, xfsstats);
22 22
23static int counter_val(int idx)
24{
25 int val = 0, cpu;
26
27 for_each_possible_cpu(cpu)
28 val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx));
29 return val;
30}
31
23static int xfs_stat_proc_show(struct seq_file *m, void *v) 32static int xfs_stat_proc_show(struct seq_file *m, void *v)
24{ 33{
25 int c, i, j, val; 34 int i, j;
26 __uint64_t xs_xstrat_bytes = 0; 35 __uint64_t xs_xstrat_bytes = 0;
27 __uint64_t xs_write_bytes = 0; 36 __uint64_t xs_write_bytes = 0;
28 __uint64_t xs_read_bytes = 0; 37 __uint64_t xs_read_bytes = 0;
@@ -50,20 +59,16 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v)
50 { "abtc2", XFSSTAT_END_ABTC_V2 }, 59 { "abtc2", XFSSTAT_END_ABTC_V2 },
51 { "bmbt2", XFSSTAT_END_BMBT_V2 }, 60 { "bmbt2", XFSSTAT_END_BMBT_V2 },
52 { "ibt2", XFSSTAT_END_IBT_V2 }, 61 { "ibt2", XFSSTAT_END_IBT_V2 },
62 /* we print both series of quota information together */
63 { "qm", XFSSTAT_END_QM },
53 }; 64 };
54 65
55 /* Loop over all stats groups */ 66 /* Loop over all stats groups */
56 for (i=j = 0; i < ARRAY_SIZE(xstats); i++) { 67 for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
57 seq_printf(m, "%s", xstats[i].desc); 68 seq_printf(m, "%s", xstats[i].desc);
58 /* inner loop does each group */ 69 /* inner loop does each group */
59 while (j < xstats[i].endpoint) { 70 for (; j < xstats[i].endpoint; j++)
60 val = 0; 71 seq_printf(m, " %u", counter_val(j));
61 /* sum over all cpus */
62 for_each_possible_cpu(c)
63 val += *(((__u32*)&per_cpu(xfsstats, c) + j));
64 seq_printf(m, " %u", val);
65 j++;
66 }
67 seq_putc(m, '\n'); 72 seq_putc(m, '\n');
68 } 73 }
69 /* extra precision counters */ 74 /* extra precision counters */
@@ -97,6 +102,58 @@ static const struct file_operations xfs_stat_proc_fops = {
97 .release = single_release, 102 .release = single_release,
98}; 103};
99 104
105/* legacy quota interfaces */
106#ifdef CONFIG_XFS_QUOTA
107static int xqm_proc_show(struct seq_file *m, void *v)
108{
109 /* maximum; incore; ratio free to inuse; freelist */
110 seq_printf(m, "%d\t%d\t%d\t%u\n",
111 0,
112 counter_val(XFSSTAT_END_XQMSTAT),
113 0,
114 counter_val(XFSSTAT_END_XQMSTAT + 1));
115 return 0;
116}
117
118static int xqm_proc_open(struct inode *inode, struct file *file)
119{
120 return single_open(file, xqm_proc_show, NULL);
121}
122
123static const struct file_operations xqm_proc_fops = {
124 .owner = THIS_MODULE,
125 .open = xqm_proc_open,
126 .read = seq_read,
127 .llseek = seq_lseek,
128 .release = single_release,
129};
130
131/* legacy quota stats interface no 2 */
132static int xqmstat_proc_show(struct seq_file *m, void *v)
133{
134 int j;
135
136 seq_printf(m, "qm");
137 for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
138 seq_printf(m, " %u", counter_val(j));
139 seq_putc(m, '\n');
140 return 0;
141}
142
143static int xqmstat_proc_open(struct inode *inode, struct file *file)
144{
145 return single_open(file, xqmstat_proc_show, NULL);
146}
147
148static const struct file_operations xqmstat_proc_fops = {
149 .owner = THIS_MODULE,
150 .open = xqmstat_proc_open,
151 .read = seq_read,
152 .llseek = seq_lseek,
153 .release = single_release,
154};
155#endif /* CONFIG_XFS_QUOTA */
156
100int 157int
101xfs_init_procfs(void) 158xfs_init_procfs(void)
102{ 159{
@@ -105,10 +162,24 @@ xfs_init_procfs(void)
105 162
106 if (!proc_create("fs/xfs/stat", 0, NULL, 163 if (!proc_create("fs/xfs/stat", 0, NULL,
107 &xfs_stat_proc_fops)) 164 &xfs_stat_proc_fops))
108 goto out_remove_entry; 165 goto out_remove_xfs_dir;
166#ifdef CONFIG_XFS_QUOTA
167 if (!proc_create("fs/xfs/xqmstat", 0, NULL,
168 &xqmstat_proc_fops))
169 goto out_remove_stat_file;
170 if (!proc_create("fs/xfs/xqm", 0, NULL,
171 &xqm_proc_fops))
172 goto out_remove_xqmstat_file;
173#endif
109 return 0; 174 return 0;
110 175
111 out_remove_entry: 176#ifdef CONFIG_XFS_QUOTA
177 out_remove_xqmstat_file:
178 remove_proc_entry("fs/xfs/xqmstat", NULL);
179 out_remove_stat_file:
180 remove_proc_entry("fs/xfs/stat", NULL);
181#endif
182 out_remove_xfs_dir:
112 remove_proc_entry("fs/xfs", NULL); 183 remove_proc_entry("fs/xfs", NULL);
113 out: 184 out:
114 return -ENOMEM; 185 return -ENOMEM;
@@ -117,6 +188,10 @@ xfs_init_procfs(void)
117void 188void
118xfs_cleanup_procfs(void) 189xfs_cleanup_procfs(void)
119{ 190{
191#ifdef CONFIG_XFS_QUOTA
192 remove_proc_entry("fs/xfs/xqm", NULL);
193 remove_proc_entry("fs/xfs/xqmstat", NULL);
194#endif
120 remove_proc_entry("fs/xfs/stat", NULL); 195 remove_proc_entry("fs/xfs/stat", NULL);
121 remove_proc_entry("fs/xfs", NULL); 196 remove_proc_entry("fs/xfs", NULL);
122} 197}
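Hoisting the per-CPU summation into counter_val() is what lets the new xqm and xqmstat handlers above reuse it instead of open-coding the loop. A userspace analogue, where plain arrays stand in for the per_cpu(xfsstats, cpu) instances; the sizes are invented.

    #include <stdio.h>

    #define NCPUS     4
    #define NCOUNTERS 8

    /* one flat counter array per cpu, like struct xfsstats per cpu */
    static unsigned int percpu_stats[NCPUS][NCOUNTERS];

    static unsigned int counter_val(int idx)
    {
        unsigned int val = 0;
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++)
            val += percpu_stats[cpu][idx];
        return val;
    }

    int main(void)
    {
        percpu_stats[0][3] = 5;
        percpu_stats[2][3] = 7;
        printf("counter 3 totals %u\n", counter_val(3));   /* prints 12 */
        return 0;
    }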
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 736854b1ca1a..c03ad38ceaeb 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -183,6 +183,16 @@ struct xfsstats {
183 __uint32_t xs_ibt_2_alloc; 183 __uint32_t xs_ibt_2_alloc;
184 __uint32_t xs_ibt_2_free; 184 __uint32_t xs_ibt_2_free;
185 __uint32_t xs_ibt_2_moves; 185 __uint32_t xs_ibt_2_moves;
186#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_IBT_V2+6)
187 __uint32_t xs_qm_dqreclaims;
188 __uint32_t xs_qm_dqreclaim_misses;
189 __uint32_t xs_qm_dquot_dups;
190 __uint32_t xs_qm_dqcachemisses;
191 __uint32_t xs_qm_dqcachehits;
192 __uint32_t xs_qm_dqwants;
193#define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2)
194 __uint32_t xs_qm_dquot;
195 __uint32_t xs_qm_dquot_unused;
186/* Extra precision counters */ 196/* Extra precision counters */
187 __uint64_t xs_xstrat_bytes; 197 __uint64_t xs_xstrat_bytes;
188 __uint64_t xs_write_bytes; 198 __uint64_t xs_write_bytes;
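The XFSSTAT_END_* markers give each counter group a numeric endpoint computed from the previous one, so the printing loop can walk the flat __uint32_t array group by group without naming any field; the new quota counters simply extend the array and add two markers. A compilable sketch of that indexing scheme, with invented group names and sizes.

    #include <stdio.h>

    /* each marker is the previous one plus the fields in the group */
    #define END_EXTENT_ALLOC 4
    #define END_ABT          (END_EXTENT_ALLOC + 2)
    #define END_QM           (END_ABT + 3)

    static unsigned int stats[END_QM];

    int main(void)
    {
        static const struct { const char *name; int end; } groups[] = {
            { "extent_alloc", END_EXTENT_ALLOC },
            { "abt",          END_ABT },
            { "qm",           END_QM },
        };
        size_t i;
        int j = 0;

        stats[END_ABT] = 42;   /* first counter of the "qm" group */

        for (i = 0; i < sizeof(groups) / sizeof(groups[0]); i++) {
            printf("%s", groups[i].name);
            for (; j < groups[i].end; j++)
                printf(" %u", stats[j]);
            putchar('\n');
        }
        return 0;
    }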
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index baf40e378d35..dab9a5f6dfd6 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -324,10 +324,9 @@ xfs_parseargs(
324 } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { 324 } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
325 mp->m_flags |= XFS_MOUNT_FILESTREAMS; 325 mp->m_flags |= XFS_MOUNT_FILESTREAMS;
326 } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { 326 } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
327 mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | 327 mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
328 XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | 328 mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
329 XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | 329 mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
330 XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD);
331 } else if (!strcmp(this_char, MNTOPT_QUOTA) || 330 } else if (!strcmp(this_char, MNTOPT_QUOTA) ||
332 !strcmp(this_char, MNTOPT_UQUOTA) || 331 !strcmp(this_char, MNTOPT_UQUOTA) ||
333 !strcmp(this_char, MNTOPT_USRQUOTA)) { 332 !strcmp(this_char, MNTOPT_USRQUOTA)) {
@@ -760,6 +759,36 @@ xfs_setup_devices(
760 return 0; 759 return 0;
761} 760}
762 761
762STATIC int
763xfs_init_mount_workqueues(
764 struct xfs_mount *mp)
765{
766 mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
767 WQ_MEM_RECLAIM, 0, mp->m_fsname);
768 if (!mp->m_data_workqueue)
769 goto out;
770
771 mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
772 WQ_MEM_RECLAIM, 0, mp->m_fsname);
773 if (!mp->m_unwritten_workqueue)
774 goto out_destroy_data_iodone_queue;
775
776 return 0;
777
778out_destroy_data_iodone_queue:
779 destroy_workqueue(mp->m_data_workqueue);
780out:
781 return -ENOMEM;
782}
783
784STATIC void
785xfs_destroy_mount_workqueues(
786 struct xfs_mount *mp)
787{
788 destroy_workqueue(mp->m_data_workqueue);
789 destroy_workqueue(mp->m_unwritten_workqueue);
790}
791
763/* Catch misguided souls that try to use this interface on XFS */ 792/* Catch misguided souls that try to use this interface on XFS */
764STATIC struct inode * 793STATIC struct inode *
765xfs_fs_alloc_inode( 794xfs_fs_alloc_inode(
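xfs_init_mount_workqueues() above uses the usual goto ladder: allocate in order, unwind on failure only what already exists, and pair the whole thing with a destroy function for the teardown path. A userspace analogue of that shape; wq_alloc and wq_free are stand-ins for alloc_workqueue() and destroy_workqueue(), not real APIs.

    #include <stdio.h>
    #include <stdlib.h>

    struct mount { void *data_wq; void *unwritten_wq; };

    static void *wq_alloc(const char *name)
    {
        printf("alloc %s\n", name);
        return malloc(1);
    }
    static void wq_free(void *wq) { free(wq); }

    static int init_mount_workqueues(struct mount *mp)
    {
        mp->data_wq = wq_alloc("xfs-data");
        if (!mp->data_wq)
            goto out;

        mp->unwritten_wq = wq_alloc("xfs-conv");
        if (!mp->unwritten_wq)
            goto out_destroy_data;

        return 0;

    out_destroy_data:
        wq_free(mp->data_wq);   /* unwind only what was created */
    out:
        return -1;              /* stands in for -ENOMEM */
    }

    static void destroy_mount_workqueues(struct mount *mp)
    {
        wq_free(mp->data_wq);
        wq_free(mp->unwritten_wq);
    }

    int main(void)
    {
        struct mount mp;

        if (!init_mount_workqueues(&mp))
            destroy_mount_workqueues(&mp);
        return 0;
    }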
@@ -834,91 +863,58 @@ xfs_fs_inode_init_once(
834} 863}
835 864
836/* 865/*
837 * Dirty the XFS inode when mark_inode_dirty_sync() is called so that 866 * This is called by the VFS when dirtying inode metadata. This can happen
838 * we catch unlogged VFS level updates to the inode. 867 * for a few reasons, but we only care about timestamp updates, given that
868 * we handled the rest ourselves. In theory no other calls should happen,
869 * but for example generic_write_end() keeps dirtying the inode after
870 * updating i_size. Thus we check that the flags are exactly I_DIRTY_SYNC,
871 * and skip this call otherwise.
839 * 872 *
840 * We need the barrier() to maintain correct ordering between unlogged 873 * We'll hopefully get a different method just for updating timestamps soon,
841 * updates and the transaction commit code that clears the i_update_core 874 * at which point this hack can go away, and maybe we'll also get real
842 * field. This requires all updates to be completed before marking the 875 * error handling here.
843 * inode dirty.
844 */ 876 */
845STATIC void 877STATIC void
846xfs_fs_dirty_inode( 878xfs_fs_dirty_inode(
847 struct inode *inode,
848 int flags)
849{
850 barrier();
851 XFS_I(inode)->i_update_core = 1;
852}
853
854STATIC int
855xfs_fs_write_inode(
856 struct inode *inode, 879 struct inode *inode,
857 struct writeback_control *wbc) 880 int flags)
858{ 881{
859 struct xfs_inode *ip = XFS_I(inode); 882 struct xfs_inode *ip = XFS_I(inode);
860 struct xfs_mount *mp = ip->i_mount; 883 struct xfs_mount *mp = ip->i_mount;
861 int error = EAGAIN; 884 struct xfs_trans *tp;
862 885 int error;
863 trace_xfs_write_inode(ip);
864
865 if (XFS_FORCED_SHUTDOWN(mp))
866 return -XFS_ERROR(EIO);
867 886
868 if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) { 887 if (flags != I_DIRTY_SYNC)
869 /* 888 return;
870 * Make sure the inode has made it it into the log. Instead
871 * of forcing it all the way to stable storage using a
872 * synchronous transaction we let the log force inside the
873 * ->sync_fs call do that for thus, which reduces the number
874 * of synchronous log forces dramatically.
875 */
876 error = xfs_log_dirty_inode(ip, NULL, 0);
877 if (error)
878 goto out;
879 return 0;
880 } else {
881 if (!ip->i_update_core)
882 return 0;
883 889
884 /* 890 trace_xfs_dirty_inode(ip);
885 * We make this non-blocking if the inode is contended, return
886 * EAGAIN to indicate to the caller that they did not succeed.
887 * This prevents the flush path from blocking on inodes inside
888 * another operation right now, they get caught later by
889 * xfs_sync.
890 */
891 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
892 goto out;
893 891
894 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) 892 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
895 goto out_unlock; 893 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
896 894 if (error) {
897 /* 895 xfs_trans_cancel(tp, 0);
898 * Now we have the flush lock and the inode is not pinned, we 896 goto trouble;
899 * can check if the inode is really clean as we know that
900 * there are no pending transaction completions, it is not
901 * waiting on the delayed write queue and there is no IO in
902 * progress.
903 */
904 if (xfs_inode_clean(ip)) {
905 xfs_ifunlock(ip);
906 error = 0;
907 goto out_unlock;
908 }
909 error = xfs_iflush(ip, SYNC_TRYLOCK);
910 } 897 }
911 898 xfs_ilock(ip, XFS_ILOCK_EXCL);
912 out_unlock:
913 xfs_iunlock(ip, XFS_ILOCK_SHARED);
914 out:
915 /* 899 /*
916 * if we failed to write out the inode then mark 900 * Grab all the latest timestamps from the Linux inode.
917 * it dirty again so we'll try again later.
918 */ 901 */
902 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
903 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
904 ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
905 ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
906 ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
907 ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
908
909 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
910 xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
911 error = xfs_trans_commit(tp, 0);
919 if (error) 912 if (error)
920 xfs_mark_inode_dirty_sync(ip); 913 goto trouble;
921 return -error; 914 return;
915
916trouble:
917 xfs_warn(mp, "failed to update timestamps for inode 0x%llx", ip->i_ino);
922} 918}
923 919
924STATIC void 920STATIC void
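The new dirty_inode path above is a tiny transaction: allocate, reserve, cancel on reservation failure, otherwise log the timestamps and commit, warning if either step fails. A sketch of just that control flow; every type and helper here is invented, and the reservation outcome is passed in rather than taken from a real log.

    #include <stdio.h>

    struct trans { int reserved; };

    static int trans_reserve(struct trans *tp, int have_space)
    {
        tp->reserved = have_space;
        return have_space ? 0 : -1;
    }
    static void trans_cancel(struct trans *tp) { (void)tp; }
    static int  trans_commit(struct trans *tp) { (void)tp; return 0; }

    /* a failed reservation is cancelled; a reserved transaction is the
     * only thing that may be committed; both failures only warn */
    static void log_timestamps(int have_space)
    {
        struct trans tp = { 0 };

        if (trans_reserve(&tp, have_space)) {
            trans_cancel(&tp);
            goto trouble;
        }
        /* ... copy i_atime/i_ctime/i_mtime into the on-disk inode ... */
        if (trans_commit(&tp))
            goto trouble;
        puts("timestamps logged");
        return;
    trouble:
        fprintf(stderr, "failed to update timestamps\n");
    }

    int main(void)
    {
        log_timestamps(1);
        log_timestamps(0);
        return 0;
    }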
@@ -954,6 +950,22 @@ xfs_fs_evict_inode(
954 xfs_inactive(ip); 950 xfs_inactive(ip);
955} 951}
956 952
953/*
954 * We do an unlocked check for XFS_IDONTCACHE here because we are already
955 * serialised against cache hits here via the inode->i_lock and igrab() in
956 * xfs_iget_cache_hit(). Hence a lookup that might clear this flag will not be
957 * racing with us, and it avoids needing to grab a spinlock here for every inode
958 * we drop the final reference on.
959 */
960STATIC int
961xfs_fs_drop_inode(
962 struct inode *inode)
963{
964 struct xfs_inode *ip = XFS_I(inode);
965
966 return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE);
967}
968
957STATIC void 969STATIC void
958xfs_free_fsname( 970xfs_free_fsname(
959 struct xfs_mount *mp) 971 struct xfs_mount *mp)
@@ -983,6 +995,7 @@ xfs_fs_put_super(
983 xfs_unmountfs(mp); 995 xfs_unmountfs(mp);
984 xfs_freesb(mp); 996 xfs_freesb(mp);
985 xfs_icsb_destroy_counters(mp); 997 xfs_icsb_destroy_counters(mp);
998 xfs_destroy_mount_workqueues(mp);
986 xfs_close_devices(mp); 999 xfs_close_devices(mp);
987 xfs_free_fsname(mp); 1000 xfs_free_fsname(mp);
988 kfree(mp); 1001 kfree(mp);
@@ -1309,10 +1322,14 @@ xfs_fs_fill_super(
1309 if (error) 1322 if (error)
1310 goto out_free_fsname; 1323 goto out_free_fsname;
1311 1324
1312 error = xfs_icsb_init_counters(mp); 1325 error = xfs_init_mount_workqueues(mp);
1313 if (error) 1326 if (error)
1314 goto out_close_devices; 1327 goto out_close_devices;
1315 1328
1329 error = xfs_icsb_init_counters(mp);
1330 if (error)
1331 goto out_destroy_workqueues;
1332
1316 error = xfs_readsb(mp, flags); 1333 error = xfs_readsb(mp, flags);
1317 if (error) 1334 if (error)
1318 goto out_destroy_counters; 1335 goto out_destroy_counters;
@@ -1376,6 +1393,8 @@ xfs_fs_fill_super(
1376 xfs_freesb(mp); 1393 xfs_freesb(mp);
1377 out_destroy_counters: 1394 out_destroy_counters:
1378 xfs_icsb_destroy_counters(mp); 1395 xfs_icsb_destroy_counters(mp);
1396out_destroy_workqueues:
1397 xfs_destroy_mount_workqueues(mp);
1379 out_close_devices: 1398 out_close_devices:
1380 xfs_close_devices(mp); 1399 xfs_close_devices(mp);
1381 out_free_fsname: 1400 out_free_fsname:
@@ -1429,8 +1448,8 @@ static const struct super_operations xfs_super_operations = {
1429 .alloc_inode = xfs_fs_alloc_inode, 1448 .alloc_inode = xfs_fs_alloc_inode,
1430 .destroy_inode = xfs_fs_destroy_inode, 1449 .destroy_inode = xfs_fs_destroy_inode,
1431 .dirty_inode = xfs_fs_dirty_inode, 1450 .dirty_inode = xfs_fs_dirty_inode,
1432 .write_inode = xfs_fs_write_inode,
1433 .evict_inode = xfs_fs_evict_inode, 1451 .evict_inode = xfs_fs_evict_inode,
1452 .drop_inode = xfs_fs_drop_inode,
1434 .put_super = xfs_fs_put_super, 1453 .put_super = xfs_fs_put_super,
1435 .sync_fs = xfs_fs_sync_fs, 1454 .sync_fs = xfs_fs_sync_fs,
1436 .freeze_fs = xfs_fs_freeze, 1455 .freeze_fs = xfs_fs_freeze,
@@ -1604,12 +1623,28 @@ xfs_init_workqueues(void)
1604 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0); 1623 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0);
1605 if (!xfs_syncd_wq) 1624 if (!xfs_syncd_wq)
1606 return -ENOMEM; 1625 return -ENOMEM;
1626
1627 /*
1628 * The allocation workqueue can be used in memory reclaim situations
1629 * (writepage path), and parallelism is only limited by the number of
1630 * AGs in all the filesystems mounted. Hence use the default large
1631 * max_active value for this workqueue.
1632 */
1633 xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
1634 if (!xfs_alloc_wq)
1635 goto out_destroy_syncd;
1636
1607 return 0; 1637 return 0;
1638
1639out_destroy_syncd:
1640 destroy_workqueue(xfs_syncd_wq);
1641 return -ENOMEM;
1608} 1642}
1609 1643
1610STATIC void 1644STATIC void
1611xfs_destroy_workqueues(void) 1645xfs_destroy_workqueues(void)
1612{ 1646{
1647 destroy_workqueue(xfs_alloc_wq);
1613 destroy_workqueue(xfs_syncd_wq); 1648 destroy_workqueue(xfs_syncd_wq);
1614} 1649}
1615 1650
@@ -1651,13 +1686,17 @@ init_xfs_fs(void)
1651 if (error) 1686 if (error)
1652 goto out_cleanup_procfs; 1687 goto out_cleanup_procfs;
1653 1688
1654 vfs_initquota(); 1689 error = xfs_qm_init();
1690 if (error)
1691 goto out_sysctl_unregister;
1655 1692
1656 error = register_filesystem(&xfs_fs_type); 1693 error = register_filesystem(&xfs_fs_type);
1657 if (error) 1694 if (error)
1658 goto out_sysctl_unregister; 1695 goto out_qm_exit;
1659 return 0; 1696 return 0;
1660 1697
1698 out_qm_exit:
1699 xfs_qm_exit();
1661 out_sysctl_unregister: 1700 out_sysctl_unregister:
1662 xfs_sysctl_unregister(); 1701 xfs_sysctl_unregister();
1663 out_cleanup_procfs: 1702 out_cleanup_procfs:
@@ -1679,7 +1718,7 @@ init_xfs_fs(void)
1679STATIC void __exit 1718STATIC void __exit
1680exit_xfs_fs(void) 1719exit_xfs_fs(void)
1681{ 1720{
1682 vfs_exitquota(); 1721 xfs_qm_exit();
1683 unregister_filesystem(&xfs_fs_type); 1722 unregister_filesystem(&xfs_fs_type);
1684 xfs_sysctl_unregister(); 1723 xfs_sysctl_unregister();
1685 xfs_cleanup_procfs(); 1724 xfs_cleanup_procfs();
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 50a3266c999e..09b0c26b2245 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -21,13 +21,11 @@
21#include <linux/exportfs.h> 21#include <linux/exportfs.h>
22 22
23#ifdef CONFIG_XFS_QUOTA 23#ifdef CONFIG_XFS_QUOTA
24extern void xfs_qm_init(void); 24extern int xfs_qm_init(void);
25extern void xfs_qm_exit(void); 25extern void xfs_qm_exit(void);
26# define vfs_initquota() xfs_qm_init()
27# define vfs_exitquota() xfs_qm_exit()
28#else 26#else
29# define vfs_initquota() do { } while (0) 27# define xfs_qm_init() (0)
30# define vfs_exitquota() do { } while (0) 28# define xfs_qm_exit() do { } while (0)
31#endif 29#endif
32 30
33#ifdef CONFIG_XFS_POSIX_ACL 31#ifdef CONFIG_XFS_POSIX_ACL
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 40b75eecd2b4..205ebcb34d9e 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -336,32 +336,6 @@ xfs_sync_fsdata(
336 return error; 336 return error;
337} 337}
338 338
339int
340xfs_log_dirty_inode(
341 struct xfs_inode *ip,
342 struct xfs_perag *pag,
343 int flags)
344{
345 struct xfs_mount *mp = ip->i_mount;
346 struct xfs_trans *tp;
347 int error;
348
349 if (!ip->i_update_core)
350 return 0;
351
352 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
353 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
354 if (error) {
355 xfs_trans_cancel(tp, 0);
356 return error;
357 }
358
359 xfs_ilock(ip, XFS_ILOCK_EXCL);
360 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
361 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
362 return xfs_trans_commit(tp, 0);
363}
364
365/* 339/*
366 * When remounting a filesystem read-only or freezing the filesystem, we have 340 * When remounting a filesystem read-only or freezing the filesystem, we have
367 * two phases to execute. This first phase is syncing the data before we 341 * two phases to execute. This first phase is syncing the data before we
@@ -385,16 +359,6 @@ xfs_quiesce_data(
385{ 359{
386 int error, error2 = 0; 360 int error, error2 = 0;
387 361
388 /*
389 * Log all pending size and timestamp updates. The vfs writeback
390 * code is supposed to do this, but due to its overagressive
391 * livelock detection it will skip inodes where appending writes
392 * were written out in the first non-blocking sync phase if their
393 * completion took long enough that it happened after taking the
394 * timestamp for the cut-off in the blocking phase.
395 */
396 xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0);
397
398 /* force out the log */ 362 /* force out the log */
399 xfs_log_force(mp, XFS_LOG_SYNC); 363 xfs_log_force(mp, XFS_LOG_SYNC);
400 364
@@ -913,17 +877,15 @@ reclaim:
913 * can reference the inodes in the cache without taking references. 877 * can reference the inodes in the cache without taking references.
914 * 878 *
915 * We make that OK here by ensuring that we wait until the inode is 879 * We make that OK here by ensuring that we wait until the inode is
916 * unlocked after the lookup before we go ahead and free it. We get 880 * unlocked after the lookup before we go ahead and free it.
917 * both the ilock and the iolock because the code may need to drop the
918 * ilock one but will still hold the iolock.
919 */ 881 */
920 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 882 xfs_ilock(ip, XFS_ILOCK_EXCL);
921 xfs_qm_dqdetach(ip); 883 xfs_qm_dqdetach(ip);
922 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 884 xfs_iunlock(ip, XFS_ILOCK_EXCL);
923 885
924 xfs_inode_free(ip); 886 xfs_inode_free(ip);
925 return error;
926 887
888 return error;
927} 889}
928 890
929/* 891/*
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index fa965479d788..941202e7ac6e 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -34,8 +34,6 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
34 34
35void xfs_flush_inodes(struct xfs_inode *ip); 35void xfs_flush_inodes(struct xfs_inode *ip);
36 36
37int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags);
38
39int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); 37int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
40int xfs_reclaim_inodes_count(struct xfs_mount *mp); 38int xfs_reclaim_inodes_count(struct xfs_mount *mp);
41void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); 39void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index bb134a819930..06838c42b2a0 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -580,7 +580,7 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr);
580DEFINE_INODE_EVENT(xfs_dir_fsync); 580DEFINE_INODE_EVENT(xfs_dir_fsync);
581DEFINE_INODE_EVENT(xfs_file_fsync); 581DEFINE_INODE_EVENT(xfs_file_fsync);
582DEFINE_INODE_EVENT(xfs_destroy_inode); 582DEFINE_INODE_EVENT(xfs_destroy_inode);
583DEFINE_INODE_EVENT(xfs_write_inode); 583DEFINE_INODE_EVENT(xfs_dirty_inode);
584DEFINE_INODE_EVENT(xfs_evict_inode); 584DEFINE_INODE_EVENT(xfs_evict_inode);
585 585
586DEFINE_INODE_EVENT(xfs_dquot_dqalloc); 586DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
@@ -627,16 +627,19 @@ DECLARE_EVENT_CLASS(xfs_namespace_class,
627 TP_STRUCT__entry( 627 TP_STRUCT__entry(
628 __field(dev_t, dev) 628 __field(dev_t, dev)
629 __field(xfs_ino_t, dp_ino) 629 __field(xfs_ino_t, dp_ino)
630 __field(int, namelen)
630 __dynamic_array(char, name, name->len) 631 __dynamic_array(char, name, name->len)
631 ), 632 ),
632 TP_fast_assign( 633 TP_fast_assign(
633 __entry->dev = VFS_I(dp)->i_sb->s_dev; 634 __entry->dev = VFS_I(dp)->i_sb->s_dev;
634 __entry->dp_ino = dp->i_ino; 635 __entry->dp_ino = dp->i_ino;
636 __entry->namelen = name->len;
635 memcpy(__get_str(name), name->name, name->len); 637 memcpy(__get_str(name), name->name, name->len);
636 ), 638 ),
637 TP_printk("dev %d:%d dp ino 0x%llx name %s", 639 TP_printk("dev %d:%d dp ino 0x%llx name %.*s",
638 MAJOR(__entry->dev), MINOR(__entry->dev), 640 MAJOR(__entry->dev), MINOR(__entry->dev),
639 __entry->dp_ino, 641 __entry->dp_ino,
642 __entry->namelen,
640 __get_str(name)) 643 __get_str(name))
641) 644)
642 645
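The __dynamic_array name buffers in these events are not NUL-terminated, so the hunk records the length and prints with the "%.*s" precision form, which stops after exactly namelen bytes. The same idiom in plain C:

    #include <stdio.h>

    int main(void)
    {
        /* a name buffer that is NOT NUL-terminated, like __dynamic_array */
        const char name[4] = { 'd', 'a', 't', 'a' };
        int namelen = (int)sizeof(name);

        /* "%.*s" prints at most namelen bytes, so no terminator is needed */
        printf("name %.*s\n", namelen, name);
        return 0;
    }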
@@ -658,6 +661,8 @@ TRACE_EVENT(xfs_rename,
658 __field(dev_t, dev) 661 __field(dev_t, dev)
659 __field(xfs_ino_t, src_dp_ino) 662 __field(xfs_ino_t, src_dp_ino)
660 __field(xfs_ino_t, target_dp_ino) 663 __field(xfs_ino_t, target_dp_ino)
664 __field(int, src_namelen)
665 __field(int, target_namelen)
661 __dynamic_array(char, src_name, src_name->len) 666 __dynamic_array(char, src_name, src_name->len)
662 __dynamic_array(char, target_name, target_name->len) 667 __dynamic_array(char, target_name, target_name->len)
663 ), 668 ),
@@ -665,15 +670,20 @@ TRACE_EVENT(xfs_rename,
665 __entry->dev = VFS_I(src_dp)->i_sb->s_dev; 670 __entry->dev = VFS_I(src_dp)->i_sb->s_dev;
666 __entry->src_dp_ino = src_dp->i_ino; 671 __entry->src_dp_ino = src_dp->i_ino;
667 __entry->target_dp_ino = target_dp->i_ino; 672 __entry->target_dp_ino = target_dp->i_ino;
673 __entry->src_namelen = src_name->len;
674 __entry->target_namelen = target_name->len;
668 memcpy(__get_str(src_name), src_name->name, src_name->len); 675 memcpy(__get_str(src_name), src_name->name, src_name->len);
669 memcpy(__get_str(target_name), target_name->name, target_name->len); 676 memcpy(__get_str(target_name), target_name->name,
677 target_name->len);
670 ), 678 ),
671 TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx" 679 TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx"
672 " src name %s target name %s", 680 " src name %.*s target name %.*s",
673 MAJOR(__entry->dev), MINOR(__entry->dev), 681 MAJOR(__entry->dev), MINOR(__entry->dev),
674 __entry->src_dp_ino, 682 __entry->src_dp_ino,
675 __entry->target_dp_ino, 683 __entry->target_dp_ino,
684 __entry->src_namelen,
676 __get_str(src_name), 685 __get_str(src_name),
686 __entry->target_namelen,
677 __get_str(target_name)) 687 __get_str(target_name))
678) 688)
679 689
@@ -741,10 +751,10 @@ DEFINE_DQUOT_EVENT(xfs_dqalloc);
741DEFINE_DQUOT_EVENT(xfs_dqtobp_read); 751DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
742DEFINE_DQUOT_EVENT(xfs_dqread); 752DEFINE_DQUOT_EVENT(xfs_dqread);
743DEFINE_DQUOT_EVENT(xfs_dqread_fail); 753DEFINE_DQUOT_EVENT(xfs_dqread_fail);
744DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
745DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
746DEFINE_DQUOT_EVENT(xfs_dqget_hit); 754DEFINE_DQUOT_EVENT(xfs_dqget_hit);
747DEFINE_DQUOT_EVENT(xfs_dqget_miss); 755DEFINE_DQUOT_EVENT(xfs_dqget_miss);
756DEFINE_DQUOT_EVENT(xfs_dqget_freeing);
757DEFINE_DQUOT_EVENT(xfs_dqget_dup);
748DEFINE_DQUOT_EVENT(xfs_dqput); 758DEFINE_DQUOT_EVENT(xfs_dqput);
749DEFINE_DQUOT_EVENT(xfs_dqput_wait); 759DEFINE_DQUOT_EVENT(xfs_dqput_wait);
750DEFINE_DQUOT_EVENT(xfs_dqput_free); 760DEFINE_DQUOT_EVENT(xfs_dqput_free);
@@ -782,12 +792,12 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
782 __entry->curr_res = tic->t_curr_res; 792 __entry->curr_res = tic->t_curr_res;
783 __entry->unit_res = tic->t_unit_res; 793 __entry->unit_res = tic->t_unit_res;
784 __entry->flags = tic->t_flags; 794 __entry->flags = tic->t_flags;
785 __entry->reserveq = list_empty(&log->l_reserveq); 795 __entry->reserveq = list_empty(&log->l_reserve_head.waiters);
786 __entry->writeq = list_empty(&log->l_writeq); 796 __entry->writeq = list_empty(&log->l_write_head.waiters);
787 xlog_crack_grant_head(&log->l_grant_reserve_head, 797 xlog_crack_grant_head(&log->l_reserve_head.grant,
788 &__entry->grant_reserve_cycle, 798 &__entry->grant_reserve_cycle,
789 &__entry->grant_reserve_bytes); 799 &__entry->grant_reserve_bytes);
790 xlog_crack_grant_head(&log->l_grant_write_head, 800 xlog_crack_grant_head(&log->l_write_head.grant,
791 &__entry->grant_write_cycle, 801 &__entry->grant_write_cycle,
792 &__entry->grant_write_bytes); 802 &__entry->grant_write_bytes);
793 __entry->curr_cycle = log->l_curr_cycle; 803 __entry->curr_cycle = log->l_curr_cycle;
@@ -826,20 +836,14 @@ DEFINE_EVENT(xfs_loggrant_class, name, \
826 TP_ARGS(log, tic)) 836 TP_ARGS(log, tic))
827DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); 837DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
828DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); 838DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
829DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
830DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); 839DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
831DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
832DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
833DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
834DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); 840DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
835DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); 841DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
836DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); 842DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
837DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); 843DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
838DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); 844DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit);
839DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); 845DEFINE_LOGGRANT_EVENT(xfs_log_regrant);
840DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep); 846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit);
841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake);
842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); 847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
844DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); 848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); 849DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -1414,7 +1418,7 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
1414DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); 1418DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
1415DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed); 1419DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed);
1416 1420
1417DECLARE_EVENT_CLASS(xfs_dir2_class, 1421DECLARE_EVENT_CLASS(xfs_da_class,
1418 TP_PROTO(struct xfs_da_args *args), 1422 TP_PROTO(struct xfs_da_args *args),
1419 TP_ARGS(args), 1423 TP_ARGS(args),
1420 TP_STRUCT__entry( 1424 TP_STRUCT__entry(
@@ -1449,7 +1453,7 @@ DECLARE_EVENT_CLASS(xfs_dir2_class,
1449) 1453)
1450 1454
1451#define DEFINE_DIR2_EVENT(name) \ 1455#define DEFINE_DIR2_EVENT(name) \
1452DEFINE_EVENT(xfs_dir2_class, name, \ 1456DEFINE_EVENT(xfs_da_class, name, \
1453 TP_PROTO(struct xfs_da_args *args), \ 1457 TP_PROTO(struct xfs_da_args *args), \
1454 TP_ARGS(args)) 1458 TP_ARGS(args))
1455DEFINE_DIR2_EVENT(xfs_dir2_sf_addname); 1459DEFINE_DIR2_EVENT(xfs_dir2_sf_addname);
@@ -1478,6 +1482,64 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
1478DEFINE_DIR2_EVENT(xfs_dir2_node_removename); 1482DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
1479DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); 1483DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
1480 1484
1485#define DEFINE_ATTR_EVENT(name) \
1486DEFINE_EVENT(xfs_da_class, name, \
1487 TP_PROTO(struct xfs_da_args *args), \
1488 TP_ARGS(args))
1489DEFINE_ATTR_EVENT(xfs_attr_sf_add);
1490DEFINE_ATTR_EVENT(xfs_attr_sf_addname);
1491DEFINE_ATTR_EVENT(xfs_attr_sf_create);
1492DEFINE_ATTR_EVENT(xfs_attr_sf_lookup);
1493DEFINE_ATTR_EVENT(xfs_attr_sf_remove);
1494DEFINE_ATTR_EVENT(xfs_attr_sf_removename);
1495DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
1496
1497DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
1498DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old);
1499DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new);
1500DEFINE_ATTR_EVENT(xfs_attr_leaf_addname);
1501DEFINE_ATTR_EVENT(xfs_attr_leaf_create);
1502DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup);
1503DEFINE_ATTR_EVENT(xfs_attr_leaf_replace);
1504DEFINE_ATTR_EVENT(xfs_attr_leaf_removename);
1505DEFINE_ATTR_EVENT(xfs_attr_leaf_split);
1506DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before);
1507DEFINE_ATTR_EVENT(xfs_attr_leaf_split_after);
1508DEFINE_ATTR_EVENT(xfs_attr_leaf_clearflag);
1509DEFINE_ATTR_EVENT(xfs_attr_leaf_setflag);
1510DEFINE_ATTR_EVENT(xfs_attr_leaf_flipflags);
1511DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf);
1512DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node);
1513DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance);
1514DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance);
1515
1516DEFINE_ATTR_EVENT(xfs_attr_node_addname);
1517DEFINE_ATTR_EVENT(xfs_attr_node_lookup);
1518DEFINE_ATTR_EVENT(xfs_attr_node_replace);
1519DEFINE_ATTR_EVENT(xfs_attr_node_removename);
1520
1521#define DEFINE_DA_EVENT(name) \
1522DEFINE_EVENT(xfs_da_class, name, \
1523 TP_PROTO(struct xfs_da_args *args), \
1524 TP_ARGS(args))
1525DEFINE_DA_EVENT(xfs_da_split);
1526DEFINE_DA_EVENT(xfs_da_join);
1527DEFINE_DA_EVENT(xfs_da_link_before);
1528DEFINE_DA_EVENT(xfs_da_link_after);
1529DEFINE_DA_EVENT(xfs_da_unlink_back);
1530DEFINE_DA_EVENT(xfs_da_unlink_forward);
1531DEFINE_DA_EVENT(xfs_da_root_split);
1532DEFINE_DA_EVENT(xfs_da_root_join);
1533DEFINE_DA_EVENT(xfs_da_node_add);
1534DEFINE_DA_EVENT(xfs_da_node_create);
1535DEFINE_DA_EVENT(xfs_da_node_split);
1536DEFINE_DA_EVENT(xfs_da_node_remove);
1537DEFINE_DA_EVENT(xfs_da_node_rebalance);
1538DEFINE_DA_EVENT(xfs_da_node_unbalance);
1539DEFINE_DA_EVENT(xfs_da_swap_lastblock);
1540DEFINE_DA_EVENT(xfs_da_grow_inode);
1541DEFINE_DA_EVENT(xfs_da_shrink_inode);
1542
1481DECLARE_EVENT_CLASS(xfs_dir2_space_class, 1543DECLARE_EVENT_CLASS(xfs_dir2_space_class,
1482 TP_PROTO(struct xfs_da_args *args, int idx), 1544 TP_PROTO(struct xfs_da_args *args, int idx),
1483 TP_ARGS(args, idx), 1545 TP_ARGS(args, idx),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 7adcdf15ae0c..103b00c90004 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -681,7 +681,6 @@ xfs_trans_reserve(
681 uint flags, 681 uint flags,
682 uint logcount) 682 uint logcount)
683{ 683{
684 int log_flags;
685 int error = 0; 684 int error = 0;
686 int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; 685 int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
687 686
@@ -707,24 +706,32 @@ xfs_trans_reserve(
707 * Reserve the log space needed for this transaction. 706 * Reserve the log space needed for this transaction.
708 */ 707 */
709 if (logspace > 0) { 708 if (logspace > 0) {
710 ASSERT((tp->t_log_res == 0) || (tp->t_log_res == logspace)); 709 bool permanent = false;
711 ASSERT((tp->t_log_count == 0) || 710
712 (tp->t_log_count == logcount)); 711 ASSERT(tp->t_log_res == 0 || tp->t_log_res == logspace);
712 ASSERT(tp->t_log_count == 0 || tp->t_log_count == logcount);
713
713 if (flags & XFS_TRANS_PERM_LOG_RES) { 714 if (flags & XFS_TRANS_PERM_LOG_RES) {
714 log_flags = XFS_LOG_PERM_RESERV;
715 tp->t_flags |= XFS_TRANS_PERM_LOG_RES; 715 tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
716 permanent = true;
716 } else { 717 } else {
717 ASSERT(tp->t_ticket == NULL); 718 ASSERT(tp->t_ticket == NULL);
718 ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); 719 ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES));
719 log_flags = 0;
720 } 720 }
721 721
722 error = xfs_log_reserve(tp->t_mountp, logspace, logcount, 722 if (tp->t_ticket != NULL) {
723 &tp->t_ticket, 723 ASSERT(flags & XFS_TRANS_PERM_LOG_RES);
724 XFS_TRANSACTION, log_flags, tp->t_type); 724 error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
725 if (error) { 725 } else {
726 goto undo_blocks; 726 error = xfs_log_reserve(tp->t_mountp, logspace,
727 logcount, &tp->t_ticket,
728 XFS_TRANSACTION, permanent,
729 tp->t_type);
727 } 730 }
731
732 if (error)
733 goto undo_blocks;
734
728 tp->t_log_res = logspace; 735 tp->t_log_res = logspace;
729 tp->t_log_count = logcount; 736 tp->t_log_count = logcount;
730 } 737 }
@@ -752,6 +759,8 @@ xfs_trans_reserve(
752 */ 759 */
753undo_log: 760undo_log:
754 if (logspace > 0) { 761 if (logspace > 0) {
762 int log_flags;
763
755 if (flags & XFS_TRANS_PERM_LOG_RES) { 764 if (flags & XFS_TRANS_PERM_LOG_RES) {
756 log_flags = XFS_LOG_REL_PERM_RESERV; 765 log_flags = XFS_LOG_REL_PERM_RESERV;
757 } else { 766 } else {
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index ed9252bcdac9..1dead07f092c 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -611,50 +611,6 @@ xfs_ail_push_all(
611} 611}
612 612
613/* 613/*
614 * This is to be called when an item is unlocked that may have
615 * been in the AIL. It will wake up the first member of the AIL
616 * wait list if this item's unlocking might allow it to progress.
617 * If the item is in the AIL, then we need to get the AIL lock
618 * while doing our checking so we don't race with someone going
619 * to sleep waiting for this event in xfs_trans_push_ail().
620 */
621void
622xfs_trans_unlocked_item(
623 struct xfs_ail *ailp,
624 xfs_log_item_t *lip)
625{
626 xfs_log_item_t *min_lip;
627
628 /*
629 * If we're forcibly shutting down, we may have
630 * unlocked log items arbitrarily. The last thing
631 * we want to do is to move the tail of the log
632 * over some potentially valid data.
633 */
634 if (!(lip->li_flags & XFS_LI_IN_AIL) ||
635 XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
636 return;
637 }
638
639 /*
640 * This is the one case where we can call into xfs_ail_min()
641 * without holding the AIL lock because we only care about the
642 * case where we are at the tail of the AIL. If the object isn't
643 * at the tail, it doesn't matter what result we get back. This
644 * is slightly racy because since we were just unlocked, we could
645 * go to sleep between the call to xfs_ail_min and the call to
646 * xfs_log_move_tail, have someone else lock us, commit to us disk,
647 * move us out of the tail of the AIL, and then we wake up. However,
648 * the call to xfs_log_move_tail() doesn't do anything if there's
649 * not enough free space to wake people up so we're safe calling it.
650 */
651 min_lip = xfs_ail_min(ailp);
652
653 if (min_lip == lip)
654 xfs_log_move_tail(ailp->xa_mount, 1);
655} /* xfs_trans_unlocked_item */
656
657/*
658 * xfs_trans_ail_update - bulk AIL insertion operation. 614 * xfs_trans_ail_update - bulk AIL insertion operation.
659 * 615 *
660 * @xfs_trans_ail_update takes an array of log items that all need to be 616 * @xfs_trans_ail_update takes an array of log items that all need to be
@@ -685,7 +641,6 @@ xfs_trans_ail_update_bulk(
685 xfs_lsn_t lsn) __releases(ailp->xa_lock) 641 xfs_lsn_t lsn) __releases(ailp->xa_lock)
686{ 642{
687 xfs_log_item_t *mlip; 643 xfs_log_item_t *mlip;
688 xfs_lsn_t tail_lsn;
689 int mlip_changed = 0; 644 int mlip_changed = 0;
690 int i; 645 int i;
691 LIST_HEAD(tmp); 646 LIST_HEAD(tmp);
@@ -712,22 +667,12 @@ xfs_trans_ail_update_bulk(
712 667
713 if (!list_empty(&tmp)) 668 if (!list_empty(&tmp))
714 xfs_ail_splice(ailp, cur, &tmp, lsn); 669 xfs_ail_splice(ailp, cur, &tmp, lsn);
670 spin_unlock(&ailp->xa_lock);
715 671
716 if (!mlip_changed) { 672 if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
717 spin_unlock(&ailp->xa_lock); 673 xlog_assign_tail_lsn(ailp->xa_mount);
718 return; 674 xfs_log_space_wake(ailp->xa_mount);
719 } 675 }
720
721 /*
722 * It is not safe to access mlip after the AIL lock is dropped, so we
723 * must get a copy of li_lsn before we do so. This is especially
724 * important on 32-bit platforms where accessing and updating 64-bit
725 * values like li_lsn is not atomic.
726 */
727 mlip = xfs_ail_min(ailp);
728 tail_lsn = mlip->li_lsn;
729 spin_unlock(&ailp->xa_lock);
730 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
731} 676}
732 677
733/* 678/*
@@ -758,7 +703,6 @@ xfs_trans_ail_delete_bulk(
758 int nr_items) __releases(ailp->xa_lock) 703 int nr_items) __releases(ailp->xa_lock)
759{ 704{
760 xfs_log_item_t *mlip; 705 xfs_log_item_t *mlip;
761 xfs_lsn_t tail_lsn;
762 int mlip_changed = 0; 706 int mlip_changed = 0;
763 int i; 707 int i;
764 708
@@ -785,23 +729,12 @@ xfs_trans_ail_delete_bulk(
785 if (mlip == lip) 729 if (mlip == lip)
786 mlip_changed = 1; 730 mlip_changed = 1;
787 } 731 }
732 spin_unlock(&ailp->xa_lock);
788 733
789 if (!mlip_changed) { 734 if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
790 spin_unlock(&ailp->xa_lock); 735 xlog_assign_tail_lsn(ailp->xa_mount);
791 return; 736 xfs_log_space_wake(ailp->xa_mount);
792 } 737 }
793
794 /*
795 * It is not safe to access mlip after the AIL lock is dropped, so we
796 * must get a copy of li_lsn before we do so. This is especially
797 * important on 32-bit platforms where accessing and updating 64-bit
798 * values like li_lsn is not atomic. It is possible we've emptied the
799 * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
800 */
801 mlip = xfs_ail_min(ailp);
802 tail_lsn = mlip ? mlip->li_lsn : 0;
803 spin_unlock(&ailp->xa_lock);
804 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
805} 738}
806 739
807/* 740/*
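Both AIL bulk operations now drop xa_lock before doing any tail or wakeup work, recomputing the tail LSN afterwards instead of sampling the minimum item under the lock. A sketch of that lock-scope shape; the flag stands in for the spinlock and every name is invented.

    #include <stdio.h>

    static int ail_lock;   /* stand-in for the xa_lock spinlock */

    static void lock(void)   { ail_lock = 1; }
    static void unlock(void) { ail_lock = 0; }

    static void assign_tail_and_wake(void)
    {
        /* must not run under the lock: it may do wakeups and more work */
        printf("recompute tail LSN, wake waiters (locked=%d)\n", ail_lock);
    }

    static void ail_update_bulk(void)
    {
        int mlip_changed;

        lock();
        /* ... splice items; note whether the minimum item changed ... */
        mlip_changed = 1;
        unlock();   /* drop the lock before tail/wakeup processing */

        if (mlip_changed)
            assign_tail_and_wake();
    }

    int main(void) { ail_update_bulk(); return 0; }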
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 475a4ded4f41..1302d1d95a58 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -463,19 +463,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
463 * Default to a normal brelse() call if the tp is NULL. 463 * Default to a normal brelse() call if the tp is NULL.
464 */ 464 */
465 if (tp == NULL) { 465 if (tp == NULL) {
466 struct xfs_log_item *lip = bp->b_fspriv;
467
468 ASSERT(bp->b_transp == NULL); 466 ASSERT(bp->b_transp == NULL);
469
470 /*
471 * If there's a buf log item attached to the buffer,
472 * then let the AIL know that the buffer is being
473 * unlocked.
474 */
475 if (lip != NULL && lip->li_type == XFS_LI_BUF) {
476 bip = bp->b_fspriv;
477 xfs_trans_unlocked_item(bip->bli_item.li_ailp, lip);
478 }
479 xfs_buf_relse(bp); 467 xfs_buf_relse(bp);
480 return; 468 return;
481 } 469 }
@@ -550,21 +538,10 @@ xfs_trans_brelse(xfs_trans_t *tp,
550 ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); 538 ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
551 ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); 539 ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF));
552 xfs_buf_item_relse(bp); 540 xfs_buf_item_relse(bp);
553 bip = NULL;
554 }
555 bp->b_transp = NULL;
556
557 /*
558 * If we've still got a buf log item on the buffer, then
559 * tell the AIL that the buffer is being unlocked.
560 */
561 if (bip != NULL) {
562 xfs_trans_unlocked_item(bip->bli_item.li_ailp,
563 (xfs_log_item_t*)bip);
564 } 541 }
565 542
543 bp->b_transp = NULL;
566 xfs_buf_relse(bp); 544 xfs_buf_relse(bp);
567 return;
568} 545}
569 546
570/* 547/*
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index c4ba366d24e6..279099717ed2 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -605,7 +605,7 @@ xfs_trans_dqresv(
605 time_t timer; 605 time_t timer;
606 xfs_qwarncnt_t warns; 606 xfs_qwarncnt_t warns;
607 xfs_qwarncnt_t warnlimit; 607 xfs_qwarncnt_t warnlimit;
608 xfs_qcnt_t count; 608 xfs_qcnt_t total_count;
609 xfs_qcnt_t *resbcountp; 609 xfs_qcnt_t *resbcountp;
610 xfs_quotainfo_t *q = mp->m_quotainfo; 610 xfs_quotainfo_t *q = mp->m_quotainfo;
611 611
@@ -648,13 +648,12 @@ xfs_trans_dqresv(
648 * hardlimit or exceed the timelimit if we allocate 648 * hardlimit or exceed the timelimit if we allocate
649 * nblks. 649 * nblks.
650 */ 650 */
651 if (hardlimit > 0ULL && 651 total_count = *resbcountp + nblks;
652 hardlimit < nblks + *resbcountp) { 652 if (hardlimit && total_count > hardlimit) {
653 xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN); 653 xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
654 goto error_return; 654 goto error_return;
655 } 655 }
656 if (softlimit > 0ULL && 656 if (softlimit && total_count > softlimit) {
657 softlimit < nblks + *resbcountp) {
658 if ((timer != 0 && get_seconds() > timer) || 657 if ((timer != 0 && get_seconds() > timer) ||
659 (warns != 0 && warns >= warnlimit)) { 658 (warns != 0 && warns >= warnlimit)) {
660 xfs_quota_warn(mp, dqp, 659 xfs_quota_warn(mp, dqp,
@@ -666,7 +665,7 @@ xfs_trans_dqresv(
666 } 665 }
667 } 666 }
668 if (ninos > 0) { 667 if (ninos > 0) {
669 count = be64_to_cpu(dqp->q_core.d_icount); 668 total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos;
670 timer = be32_to_cpu(dqp->q_core.d_itimer); 669 timer = be32_to_cpu(dqp->q_core.d_itimer);
671 warns = be16_to_cpu(dqp->q_core.d_iwarns); 670 warns = be16_to_cpu(dqp->q_core.d_iwarns);
672 warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit; 671 warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
@@ -677,13 +676,11 @@ xfs_trans_dqresv(
677 if (!softlimit) 676 if (!softlimit)
678 softlimit = q->qi_isoftlimit; 677 softlimit = q->qi_isoftlimit;
679 678
680 if (hardlimit > 0ULL && 679 if (hardlimit && total_count > hardlimit) {
681 hardlimit < ninos + count) {
682 xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN); 680 xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
683 goto error_return; 681 goto error_return;
684 } 682 }
685 if (softlimit > 0ULL && 683 if (softlimit && total_count > softlimit) {
686 softlimit < ninos + count) {
687 if ((timer != 0 && get_seconds() > timer) || 684 if ((timer != 0 && get_seconds() > timer) ||
688 (warns != 0 && warns >= warnlimit)) { 685 (warns != 0 && warns >= warnlimit)) {
689 xfs_quota_warn(mp, dqp, 686 xfs_quota_warn(mp, dqp,
@@ -878,7 +875,7 @@ STATIC void
878xfs_trans_alloc_dqinfo( 875xfs_trans_alloc_dqinfo(
879 xfs_trans_t *tp) 876 xfs_trans_t *tp)
880{ 877{
881 tp->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP); 878 tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP);
882} 879}
883 880
884void 881void
@@ -887,6 +884,6 @@ xfs_trans_free_dqinfo(
887{ 884{
888 if (!tp->t_dqinfo) 885 if (!tp->t_dqinfo)
889 return; 886 return;
890 kmem_zone_free(xfs_Gqm->qm_dqtrxzone, tp->t_dqinfo); 887 kmem_zone_free(xfs_qm_dqtrxzone, tp->t_dqinfo);
891 tp->t_dqinfo = NULL; 888 tp->t_dqinfo = NULL;
892} 889}
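Precomputing total_count once turns the quota checks above into the readable "would the new total exceed the limit" form, replacing the inverted hardlimit < nblks + *resbcountp comparisons. A runnable distillation of the block-count check, with simplified types and invented names.

    #include <stdio.h>

    typedef unsigned long long qcnt_t;

    static int would_exceed(qcnt_t resbcount, qcnt_t nblks,
                            qcnt_t hardlimit, qcnt_t softlimit)
    {
        qcnt_t total_count = resbcount + nblks;   /* compute once */

        if (hardlimit && total_count > hardlimit)
            return 2;   /* hard failure: reject the reservation */
        if (softlimit && total_count > softlimit)
            return 1;   /* soft: warn or start the grace timer */
        return 0;
    }

    int main(void)
    {
        printf("%d\n", would_exceed(90, 15, 100, 80));   /* 2: over hard */
        printf("%d\n", would_exceed(70, 15, 100, 80));   /* 1: over soft */
        printf("%d\n", would_exceed(10, 15, 100, 80));   /* 0: fine */
        return 0;
    }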
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 32f0288ae10f..7a7442c03f2b 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -95,10 +95,14 @@ xfs_trans_ichgtime(
95 if ((flags & XFS_ICHGTIME_MOD) && 95 if ((flags & XFS_ICHGTIME_MOD) &&
96 !timespec_equal(&inode->i_mtime, &tv)) { 96 !timespec_equal(&inode->i_mtime, &tv)) {
97 inode->i_mtime = tv; 97 inode->i_mtime = tv;
98 ip->i_d.di_mtime.t_sec = tv.tv_sec;
99 ip->i_d.di_mtime.t_nsec = tv.tv_nsec;
98 } 100 }
99 if ((flags & XFS_ICHGTIME_CHG) && 101 if ((flags & XFS_ICHGTIME_CHG) &&
100 !timespec_equal(&inode->i_ctime, &tv)) { 102 !timespec_equal(&inode->i_ctime, &tv)) {
101 inode->i_ctime = tv; 103 inode->i_ctime = tv;
104 ip->i_d.di_ctime.t_sec = tv.tv_sec;
105 ip->i_d.di_ctime.t_nsec = tv.tv_nsec;
102 } 106 }
103} 107}
104 108
@@ -126,12 +130,12 @@ xfs_trans_log_inode(
126 /* 130 /*
127 * Always OR in the bits from the ili_last_fields field. 131 * Always OR in the bits from the ili_last_fields field.
128 * This is to coordinate with the xfs_iflush() and xfs_iflush_done() 132 * This is to coordinate with the xfs_iflush() and xfs_iflush_done()
129 * routines in the eventual clearing of the ilf_fields bits. 133 * routines in the eventual clearing of the ili_fields bits.
130 * See the big comment in xfs_iflush() for an explanation of 134 * See the big comment in xfs_iflush() for an explanation of
131 * this coordination mechanism. 135 * this coordination mechanism.
132 */ 136 */
133 flags |= ip->i_itemp->ili_last_fields; 137 flags |= ip->i_itemp->ili_last_fields;
134 ip->i_itemp->ili_format.ilf_fields |= flags; 138 ip->i_itemp->ili_fields |= flags;
135} 139}
136 140
137#ifdef XFS_TRANS_DEBUG 141#ifdef XFS_TRANS_DEBUG
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 44820b9fcb43..8ab2ced415f1 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -104,9 +104,6 @@ void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
104void xfs_ail_push_all(struct xfs_ail *); 104void xfs_ail_push_all(struct xfs_ail *);
105xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); 105xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp);
106 106
107void xfs_trans_unlocked_item(struct xfs_ail *,
108 xfs_log_item_t *);
109
110struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, 107struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
111 struct xfs_ail_cursor *cur, 108 struct xfs_ail_cursor *cur,
112 xfs_lsn_t lsn); 109 xfs_lsn_t lsn);
diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h
index 7c220b4227bc..db14d0c08682 100644
--- a/fs/xfs/xfs_vnode.h
+++ b/fs/xfs/xfs_vnode.h
@@ -22,7 +22,6 @@
22 22
23struct file; 23struct file;
24struct xfs_inode; 24struct xfs_inode;
25struct xfs_iomap;
26struct attrlist_cursor_kern; 25struct attrlist_cursor_kern;
27 26
28/* 27/*
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 0c877cbde142..447e146b2ba6 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -10,7 +10,6 @@ struct kiocb;
10struct pipe_inode_info; 10struct pipe_inode_info;
11struct uio; 11struct uio;
12struct xfs_inode; 12struct xfs_inode;
13struct xfs_iomap;
14 13
15 14
16int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags); 15int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags);
@@ -49,8 +48,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
49int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); 48int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
50int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, 49int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
51 int flags, struct attrlist_cursor_kern *cursor); 50 int flags, struct attrlist_cursor_kern *cursor);
52int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
53 int flags, struct xfs_iomap *iomapp, int *niomaps);
54void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, 51void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
55 xfs_off_t last, int fiopt); 52 xfs_off_t last, int fiopt);
56int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, 53int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,