372 files changed, 14446 insertions, 6784 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 6fcb1e7095cf..92828281a30b 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -57,7 +57,7 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
        buffer = kmap(page);
        offset = page_offset(page);
-        retval = v9fs_file_readn(filp, buffer, NULL, offset, PAGE_CACHE_SIZE);
+        retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset);
        if (retval < 0)
                goto done;
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index ab5547ff29a1..38d695d66a0b 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,7 +37,6 @@
 #include <linux/mount.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
@@ -231,10 +230,8 @@ v9fs_umount_begin(struct super_block *sb)
 {
        struct v9fs_session_info *v9ses;
-        lock_kernel();
        v9ses = sb->s_fs_info;
        v9fs_session_cancel(v9ses);
-        unlock_kernel();
 }
 static const struct super_operations v9fs_super_ops = {
diff --git a/fs/Kconfig b/fs/Kconfig
index 525da2e8f73b..0e7da7bb5d93 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -39,6 +39,13 @@ config FS_POSIX_ACL
        bool
        default n
+source "fs/xfs/Kconfig"
+source "fs/gfs2/Kconfig"
+source "fs/ocfs2/Kconfig"
+source "fs/btrfs/Kconfig"
+endif # BLOCK
 config FILE_LOCKING
        bool "Enable POSIX file locking API" if EMBEDDED
        default y
@@ -47,13 +54,6 @@ config FILE_LOCKING
          for filesystems like NFS and for the flock() system
          call. Disabling this option saves about 11k.
-source "fs/xfs/Kconfig"
-source "fs/gfs2/Kconfig"
-source "fs/ocfs2/Kconfig"
-source "fs/btrfs/Kconfig"
-endif # BLOCK
 source "fs/notify/Kconfig"
 source "fs/quota/Kconfig"
@@ -134,7 +134,7 @@ config TMPFS_POSIX_ACL
 config HUGETLBFS
        bool "HugeTLB file system support"
        depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \
-                   (S390 && 64BIT) || BROKEN
+                   (S390 && 64BIT) || SYS_SUPPORTS_HUGETLBFS || BROKEN
        help
          hugetlbfs is a filesystem backing for HugeTLB pages, based on
          ramfs. For architectures that support it, say Y here and read
@@ -186,32 +186,7 @@ source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
+source "fs/nilfs2/Kconfig"
-config NILFS2_FS
-        tristate "NILFS2 file system support (EXPERIMENTAL)"
-        depends on BLOCK && EXPERIMENTAL
-        select CRC32
-        help
-          NILFS2 is a log-structured file system (LFS) supporting continuous
-          snapshotting.  In addition to versioning capability of the entire
-          file system, users can even restore files mistakenly overwritten or
-          destroyed just a few seconds ago.  Since this file system can keep
-          consistency like conventional LFS, it achieves quick recovery after
-          system crashes.
-          NILFS2 creates a number of checkpoints every few seconds or per
-          synchronous write basis (unless there is no change).  Users can
-          select significant versions among continuously created checkpoints,
-          and can change them into snapshots which will be preserved for long
-          periods until they are changed back to checkpoints.  Each
-          snapshot is mountable as a read-only file system concurrently with
-          its writable mount, and this feature is convenient for online backup.
-          Some features including atime, extended attributes, and POSIX ACLs,
-          are not supported yet.
-          To compile this file system support as a module, choose M here: the
-          module will be called nilfs2.  If unsure, say N.
 endif # MISC_FILESYSTEMS
@@ -236,10 +211,12 @@ source "fs/nfsd/Kconfig"
 config LOCKD
        tristate
+        depends on FILE_LOCKING
 config LOCKD_V4
        bool
        depends on NFSD_V3 || NFS_V3
+        depends on FILE_LOCKING
        default y
 config EXPORTFS
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index a6665f37f456..9cc18775b832 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -1,3 +1,6 @@
+#include <linux/fs.h>
+#include <linux/adfs_fs.h>
 /* Internal data structures for ADFS */
 #define ADFS_FREE_FRAG           0
@@ -17,6 +20,58 @@
 struct buffer_head;
 /*
+ * adfs file system inode data in memory
+ */
+struct adfs_inode_info {
+        loff_t          mmu_private;
+        unsigned long   parent_id;      /* object id of parent          */
+        __u32           loadaddr;       /* RISC OS load address         */
+        __u32           execaddr;       /* RISC OS exec address         */
+        unsigned int    filetype;       /* RISC OS file type            */
+        unsigned int    attr;           /* RISC OS permissions          */
+        unsigned int    stamped:1;      /* RISC OS file has date/time   */
+        struct inode vfs_inode;         /* embedded inode; ADFS_I() maps it back */
+};
+/*
+ * Forward-declare this
+ */
+struct adfs_discmap;
+struct adfs_dir_ops;
+/*
+ * ADFS file system superblock data in memory
+ */
+struct adfs_sb_info {
+        struct adfs_discmap *s_map;     /* bh list containing map                */
+        struct adfs_dir_ops *s_dir;     /* directory operations                  */
+        uid_t           s_uid;          /* owner uid                             */
+        gid_t           s_gid;          /* owner gid                             */
+        umode_t         s_owner_mask;   /* ADFS owner perm -> unix perm          */
+        umode_t         s_other_mask;   /* ADFS other perm -> unix perm          */
+        __u32           s_ids_per_zone; /* max. no ids in one zone               */
+        __u32           s_idlen;        /* length of ID in map                   */
+        __u32           s_map_size;     /* sector size of a map                  */
+        unsigned long   s_size;         /* total size (in blocks) of this fs     */
+        signed int      s_map2blk;      /* shift left by this for map->sector    */
+        unsigned int    s_log2sharesize;/* log2 share size                       */
+        __le32          s_version;      /* disc format version                   */
+        unsigned int    s_namelen;      /* maximum number of characters in name  */
+};
+static inline struct adfs_sb_info *ADFS_SB(struct super_block *sb)
+{
+        return sb->s_fs_info;   /* s_fs_info viewed as the ADFS superblock info */
+}
+static inline struct adfs_inode_info *ADFS_I(struct inode *inode)
+{
+        /* @inode must be the vfs_inode member of a struct adfs_inode_info */
+        return container_of(inode, struct adfs_inode_info, vfs_inode);
+}
+/*
 * Directory handling
 */
 struct adfs_dir {
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 4d4073447d1a..23aa52f548a0 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -9,15 +9,7 @@
 *
 *  Common directory handling for ADFS
 */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/spinlock.h>
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h>          /* for file_fsync() */
 #include "adfs.h"
 /*
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index 31df6adf0de6..bafc71222e25 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -9,15 +9,7 @@
 *
 *  E and F format directory handling
 */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/spinlock.h>
 #include <linux/buffer_head.h>
-#include <linux/string.h>
 #include "adfs.h"
 #include "dir_f.h"
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 139e0f345f18..1796bb352d05 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -7,15 +7,7 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/spinlock.h>
 #include <linux/buffer_head.h>
-#include <linux/string.h>
 #include "adfs.h"
 #include "dir_fplus.h"
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 8224d54a2afb..005ea34d1758 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -19,10 +19,6 @@
 *
 *  adfs regular file handling primitives           
 */
-#include <linux/fs.h>
-#include <linux/buffer_head.h>                  /* for file_fsync() */
-#include <linux/adfs_fs.h>
 #include "adfs.h"
 const struct file_operations adfs_file_operations = {
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 05b3a677201d..798cb071d132 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -7,17 +7,8 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/mm.h>
 #include <linux/smp_lock.h>
-#include <linux/module.h>
 #include <linux/buffer_head.h>
 #include "adfs.h"
 /*
@@ -395,4 +386,3 @@ int adfs_write_inode(struct inode *inode, int wait)
        unlock_kernel();
        return ret;
 }
-MODULE_LICENSE("GPL");
diff --git a/fs/adfs/map.c b/fs/adfs/map.c
index 568081b93f73..d1a5932bb0f1 100644
--- a/fs/adfs/map.c
+++ b/fs/adfs/map.c
@@ -7,14 +7,8 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/spinlock.h>
 #include <linux/buffer_head.h>
 #include <asm/unaligned.h>
 #include "adfs.h"
 /*
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 0ec5aaf47aa7..6910a98bd73c 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -8,26 +8,13 @@
 * published by the Free Software Foundation.
 */
 #include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/string.h>
 #include <linux/init.h>
 #include <linux/buffer_head.h>
-#include <linux/vfs.h>
 #include <linux/parser.h>
-#include <linux/bitops.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
-#include <asm/uaccess.h>
+#include <linux/statfs.h>
-#include <asm/system.h>
-#include <stdarg.h>
 #include "adfs.h"
 #include "dir_f.h"
 #include "dir_fplus.h"
@@ -534,3 +521,4 @@ static void __exit exit_adfs_fs(void)
 module_init(init_adfs_fs)
 module_exit(exit_adfs_fs)
+MODULE_LICENSE("GPL");
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 9bd757774c9e..88067f36e5e7 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -564,7 +564,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
        struct afs_vnode *vnode, *dir;
-        struct afs_fid fid;
+        struct afs_fid uninitialized_var(fid);
        struct dentry *parent;
        struct key *key;
        void *dir_version;
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 210acafe4a9b..3ff8bdd18fb3 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -432,7 +432,6 @@ vfs_rejected_lock:
        list_del_init(&fl->fl_u.afs.link);
        if (list_empty(&vnode->granted_locks))
                afs_defer_unlock(vnode, key);
-        spin_unlock(&vnode->lock);
        goto abort_attempt;
 }
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 2d33a5f7d218..0dd4dafee10b 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/errno.h>
+#include <rxrpc/packet.h>
 #include "internal.h"
 #include "afs_fs.h"
@@ -54,6 +55,21 @@ int afs_abort_to_error(u32 abort_code)
        case 0x2f6df24:         return -ENOLCK;
        case 0x2f6df26:         return -ENOTEMPTY;
        case 0x2f6df78:         return -EDQUOT;
+        case RXKADINCONSISTENCY: return -EPROTO;
+        case RXKADPACKETSHORT:  return -EPROTO;
+        case RXKADLEVELFAIL:    return -EKEYREJECTED;
+        case RXKADTICKETLEN:    return -EKEYREJECTED;
+        case RXKADOUTOFSEQUENCE: return -EPROTO;
+        case RXKADNOAUTH:       return -EKEYREJECTED;
+        case RXKADBADKEY:       return -EKEYREJECTED;
+        case RXKADBADTICKET:    return -EKEYREJECTED;
+        case RXKADUNKNOWNKEY:   return -EKEYREJECTED;
+        case RXKADEXPIRED:      return -EKEYEXPIRED;
+        case RXKADSEALEDINCON:  return -EKEYREJECTED;
+        case RXKADDATALEN:      return -EKEYREJECTED;
+        case RXKADILLEGALLEVEL: return -EKEYREJECTED;
        default:                return -EREMOTEIO;
        }
 }
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index c52be53f6946..5ffb570cd3a8 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -17,7 +17,6 @@
 #include <linux/pagemap.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
-#include <linux/mnt_namespace.h>
 #include "internal.h"
diff --git a/fs/afs/super.c b/fs/afs/super.c
index ad0514d0115f..e1ea1c240b6a 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/parser.h>
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index ec2a7431e458..6e689208def2 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -65,6 +65,8 @@ static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl,
                                goto out;
                        goto rotate;
                case -ENOMEDIUM:
+                case -EKEYREJECTED:
+                case -EKEYEXPIRED:
                        goto out;
                default:
                        ret = -EIO;
diff --git a/fs/aio.c b/fs/aio.c
index 76da12537956..d065b2c3273e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -485,6 +485,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
 {
        assert_spin_locked(&ctx->ctx_lock);
+        if (req->ki_eventfd != NULL)
+                eventfd_ctx_put(req->ki_eventfd);
        if (req->ki_dtor)
                req->ki_dtor(req);
        if (req->ki_iovec != &req->ki_inline_vec)
@@ -509,8 +511,6 @@ static void aio_fput_routine(struct work_struct *data)
                /* Complete the fput(s) */
                if (req->ki_filp != NULL)
                        __fput(req->ki_filp);
-                if (req->ki_eventfd != NULL)
-                        __fput(req->ki_eventfd);
                /* Link the iocb into the context's free list */
                spin_lock_irq(&ctx->ctx_lock);
@@ -528,8 +528,6 @@ static void aio_fput_routine(struct work_struct *data)
 */
 static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 {
-        int schedule_putreq = 0;
        dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
                req, atomic_long_read(&req->ki_filp->f_count));
@@ -549,24 +547,16 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
         * we would not be holding the last reference to the file*, so
         * this function will be executed w/out any aio kthread wakeup.
         */
-        if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count)))
+        if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
-                schedule_putreq++;
-        else
-                req->ki_filp = NULL;
-        if (req->ki_eventfd != NULL) {
-                if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count)))
-                        schedule_putreq++;
-                else
-                        req->ki_eventfd = NULL;
-        }
-        if (unlikely(schedule_putreq)) {
                get_ioctx(ctx);
                spin_lock(&fput_lock);
                list_add(&req->ki_list, &fput_head);
                spin_unlock(&fput_lock);
                queue_work(aio_wq, &fput_work);
-        } else
+        } else {
+                req->ki_filp = NULL;
                really_put_req(ctx, req);
+        }
        return 1;
 }
@@ -1622,7 +1612,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
                 * an eventfd() fd, and will be signaled for each completed
                 * event using the eventfd_signal() function.
                 */
-                req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
+                req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
                if (IS_ERR(req->ki_eventfd)) {
                        ret = PTR_ERR(req->ki_eventfd);
                        req->ki_eventfd = NULL;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 1dd96d4406c0..47d4a01c5393 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -52,6 +52,19 @@ static const struct dentry_operations anon_inodefs_dentry_operations = {
        .d_delete       = anon_inodefs_delete_dentry,
 };
+/*
+ * No-op .set_page_dirty method so that people can use .page_mkwrite on
+ * anon inodes.  @page is ignored and the function always returns 0.
+ */
+static int anon_set_page_dirty(struct page *page)
+{
+        return 0;
+}
+static const struct address_space_operations anon_aops = {
+        .set_page_dirty = anon_set_page_dirty, /* no-op, always returns 0 */
+};
 /**
 * anon_inode_getfd - creates a new file instance by hooking it up to an
 *                    anonymous inode, and a dentry that describe the "class"
@@ -151,6 +164,8 @@ static struct inode *anon_inode_mkinode(void)
        inode->i_fop = &anon_inode_fops;
+        inode->i_mapping->a_ops = &anon_aops;
        /*
         * Mark the inode dirty from the very beginning,
         * that way it will never be moved to the dirty
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index f3da2eb51f56..00bf8fcb245f 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -19,7 +19,6 @@
 #include <linux/sched.h>
 #include <linux/compat.h>
 #include <linux/syscalls.h>
-#include <linux/smp_lock.h>
 #include <linux/magic.h>
 #include <linux/dcache.h>
 #include <linux/uaccess.h>
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 9367b6297d84..615d5496fe0f 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -513,7 +513,7 @@ befs_utf2nls(struct super_block *sb, const char *in,
 {
        struct nls_table *nls = BEFS_SB(sb)->nls;
        int i, o;
-        wchar_t uni;
+        unicode_t uni;
        int unilen, utflen;
        char *result;
        /* The utf8->nls conversion won't make the final nls string bigger
@@ -539,16 +539,16 @@ befs_utf2nls(struct super_block *sb, const char *in,
        for (i = o = 0; i < in_len; i += utflen, o += unilen) {
                /* convert from UTF-8 to Unicode */
-                utflen = utf8_mbtowc(&uni, &in[i], in_len - i);
+                utflen = utf8_to_utf32(&in[i], in_len - i, &uni);
-                if (utflen < 0) {
+                if (utflen < 0)
                        goto conv_err;
-                }
                /* convert from Unicode to nls */
+                if (uni > MAX_WCHAR_T)
+                        goto conv_err;
                unilen = nls->uni2char(uni, &result[o], in_len - o);
-                if (unilen < 0) {
+                if (unilen < 0)
                        goto conv_err;
-                }
        }
        result[o] = '\0';
        *out_len = o;
@@ -619,15 +619,13 @@ befs_nls2utf(struct super_block *sb, const char *in,
                /* convert from nls to unicode */
                unilen = nls->char2uni(&in[i], in_len - i, &uni);
-                if (unilen < 0) {
+                if (unilen < 0)
                        goto conv_err;
-                }
                /* convert from unicode to UTF-8 */
-                utflen = utf8_wctomb(&result[o], uni, 3);
+                utflen = utf32_to_utf8(uni, &result[o], 3);
-                if (utflen <= 0) {
+                if (utflen <= 0)
                        goto conv_err;
-                }
        }
        result[o] = '\0';
@@ -737,8 +735,6 @@ parse_options(char *options, befs_mount_options * opts)
 static void
 befs_put_super(struct super_block *sb)
 {
-        lock_kernel();
        kfree(BEFS_SB(sb)->mount_opts.iocharset);
        BEFS_SB(sb)->mount_opts.iocharset = NULL;
@@ -749,8 +745,6 @@ befs_put_super(struct super_block *sb)
        kfree(sb->s_fs_info);
        sb->s_fs_info = NULL;
-        unlock_kernel();
 }
 /* Allocate private field of the superblock, fill it.
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 54bd07d44e68..1e41aadb1068 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -8,7 +8,6 @@
 #include <linux/time.h>
 #include <linux/string.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/sched.h>
 #include "bfs.h"
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 6a021265f018..88b9a3ff44e4 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -11,7 +11,6 @@
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
 #include "bfs.h"
 #undef DEBUG
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 40381df34869..b7c1603cd4bd 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1340,8 +1340,10 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
        prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
        prstatus->pr_sigpend = p->pending.signal.sig[0];
        prstatus->pr_sighold = p->blocked.sig[0];
+        rcu_read_lock();
+        prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+        rcu_read_unlock();
        prstatus->pr_pid = task_pid_vnr(p);
-        prstatus->pr_ppid = task_pid_vnr(p->real_parent);
        prstatus->pr_pgrp = task_pgrp_vnr(p);
        prstatus->pr_sid = task_session_vnr(p);
        if (thread_group_leader(p)) {
@@ -1382,8 +1384,10 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
                        psinfo->pr_psargs[i] = ' ';
        psinfo->pr_psargs[len] = 0;
+        rcu_read_lock();
+        psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+        rcu_read_unlock();
        psinfo->pr_pid = task_pid_vnr(p);
-        psinfo->pr_ppid = task_pid_vnr(p->real_parent);
        psinfo->pr_pgrp = task_pgrp_vnr(p);
        psinfo->pr_sid = task_session_vnr(p);
@@ -1518,11 +1522,11 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
        info->thread = NULL;
        psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
-        fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
        if (psinfo == NULL)
                return 0;
+        fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
        /*
         * Figure out how many notes we're going to need for each thread.
         */
@@ -1925,7 +1929,10 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
        elf = kmalloc(sizeof(*elf), GFP_KERNEL);
        if (!elf)
                goto out;
-        
+        /*
+         * The number of segs is recorded in the ELF header as a 16bit value.
+         * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
+         */
        segs = current->mm->map_count;
 #ifdef ELF_CORE_EXTRA_PHDRS
        segs += ELF_CORE_EXTRA_PHDRS;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index fdb66faa24f1..20fbeced472b 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1387,8 +1387,10 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
        prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
        prstatus->pr_sigpend = p->pending.signal.sig[0];
        prstatus->pr_sighold = p->blocked.sig[0];
+        rcu_read_lock();
+        prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+        rcu_read_unlock();
        prstatus->pr_pid = task_pid_vnr(p);
-        prstatus->pr_ppid = task_pid_vnr(p->real_parent);
        prstatus->pr_pgrp = task_pgrp_vnr(p);
        prstatus->pr_sid = task_session_vnr(p);
        if (thread_group_leader(p)) {
@@ -1432,8 +1434,10 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
                        psinfo->pr_psargs[i] = ' ';
        psinfo->pr_psargs[len] = 0;
+        rcu_read_lock();
+        psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+        rcu_read_unlock();
        psinfo->pr_pid = task_pid_vnr(p);
-        psinfo->pr_ppid = task_pid_vnr(p->real_parent);
        psinfo->pr_pgrp = task_pgrp_vnr(p);
        psinfo->pr_sid = task_session_vnr(p);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 697f6b5f1313..e92f229e3c6e 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -828,15 +828,22 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
        if (IS_ERR(bprm.file))
                return res;
+        bprm.cred = prepare_exec_creds();
+        res = -ENOMEM;
+        if (!bprm.cred)
+                goto out;
        res = prepare_binprm(&bprm);
        if (res <= (unsigned long)-4096)
                res = load_flat_file(&bprm, libs, id, NULL);
-        if (bprm.file) {
-                allow_write_access(bprm.file);
+        abort_creds(bprm.cred);
-                fput(bprm.file);
-                bprm.file = NULL;
+out:
-        }
+        allow_write_access(bprm.file);
+        fput(bprm.file);
        return(res);
 }
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 31c46a241bac..49a34e7f7306 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -1,7 +1,7 @@
 /*
 * bio-integrity.c - bio data integrity extensions
 *
- * Copyright (C) 2007, 2008 Oracle Corporation
+ * Copyright (C) 2007, 2008, 2009 Oracle Corporation
 * Written by: Martin K. Petersen <martin.petersen@oracle.com>
 *
 * This program is free software; you can redistribute it and/or
@@ -25,63 +25,121 @@
 #include <linux/bio.h>
 #include <linux/workqueue.h>
-static struct kmem_cache *bio_integrity_slab __read_mostly;
+struct integrity_slab {
-static mempool_t *bio_integrity_pool;
+        struct kmem_cache *slab;
-static struct bio_set *integrity_bio_set;
+        unsigned short nr_vecs;
+        char name[8];
+};
+#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) }
+struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = {
+        IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES),
+};
+#undef IS
 static struct workqueue_struct *kintegrityd_wq;
+/*
+ * Map a requested integrity vector count onto the index of the smallest
+ * bip_slab[] entry able to hold it (slabs hold 1, 4, 16, 64, 128 and
+ * BIO_MAX_PAGES vectors).  A count of zero, or one above BIO_MAX_PAGES,
+ * is a caller bug and trips BUG().
+ */
+static inline unsigned int vecs_to_idx(unsigned int nr)
+{
+        if (nr == 0 || nr > BIO_MAX_PAGES)
+                BUG();
+        if (nr == 1)
+                return 0;
+        if (nr <= 4)
+                return 1;
+        if (nr <= 16)
+                return 2;
+        if (nr <= 64)
+                return 3;
+        if (nr <= 128)
+                return 4;
+        return 5;
+}
+/*
+ * Tell whether a payload with slab index @idx is served by the bio_set's
+ * integrity mempool rather than directly by bip_slab[idx].
+ *
+ * Only the largest slab (the BIO_MAX_PAGES one) backs the mempool.
+ * vecs_to_idx() never returns more than the index of the last bip_slab[]
+ * entry, which is always below BIOVEC_NR_POOLS, so a comparison against
+ * BIOVEC_NR_POOLS could never be true; compare against the index of the
+ * largest slab instead.
+ *
+ * Returns 1 for the mempool-backed index, 0 otherwise.
+ */
+static inline int use_bip_pool(unsigned int idx)
+{
+        return idx == vecs_to_idx(BIO_MAX_PAGES);
+}
 /**
- * bio_integrity_alloc - Allocate integrity payload and attach it to bio
+ * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
 * @bio:        bio to attach integrity metadata to
 * @gfp_mask:   Memory allocation mask
 * @nr_vecs:    Number of integrity metadata scatter-gather elements
+ * @bs:         bio_set to allocate from
 *
 * Description: This function prepares a bio for attaching integrity
 * metadata.  nr_vecs specifies the maximum number of pages containing
 * integrity metadata that can be attached.
+ *
+ * Returns the zeroed payload, already linked to @bio, or NULL if neither
+ * the slab nor the mempool of @bs could supply one.
 */
-struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
-                                                  gfp_t gfp_mask,
+                                                         gfp_t gfp_mask,
-                                                  unsigned int nr_vecs)
+                                                         unsigned int nr_vecs,
+                                                         struct bio_set *bs)
 {
        struct bio_integrity_payload *bip;
-        struct bio_vec *iv;
+        unsigned int idx = vecs_to_idx(nr_vecs);
-        unsigned long idx;
        BUG_ON(bio == NULL);
+        bip = NULL;
-        bip = mempool_alloc(bio_integrity_pool, gfp_mask);
+        /* Lower order allocations come straight from slab */
-        if (unlikely(bip == NULL)) {
+        if (!use_bip_pool(idx))
-                printk(KERN_ERR "%s: could not alloc bip\n", __func__);
+                bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask);
-                return NULL;
-        }
-        memset(bip, 0, sizeof(*bip));
+        /* Use mempool if lower order alloc failed or max vecs were requested */
+        if (bip == NULL) {
+                /*
+                 * The mempool hands out objects from the largest slab.
+                 * Record that index so the payload is later freed to the
+                 * cache it really came from, not to the smaller slab that
+                 * was asked for first.
+                 */
+                idx = vecs_to_idx(BIO_MAX_PAGES);
+                bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
-        iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, integrity_bio_set);
+                if (unlikely(bip == NULL)) {
-        if (unlikely(iv == NULL)) {
+                        printk(KERN_ERR "%s: could not alloc bip\n", __func__);
-                printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__);
+                        return NULL;
-                mempool_free(bip, bio_integrity_pool);
+                }
-                return NULL;
        }
-        bip->bip_pool = idx;
+        memset(bip, 0, sizeof(*bip));
-        bip->bip_vec = iv;
+        bip->bip_slab = idx;
        bip->bip_bio = bio;
        bio->bi_integrity = bip;
        return bip;
 }
+EXPORT_SYMBOL(bio_integrity_alloc_bioset);
+/**
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
+ * @bio:        bio to attach integrity metadata to
+ * @gfp_mask:   Memory allocation mask
+ * @nr_vecs:    Number of integrity metadata scatter-gather elements
+ *
+ * Description: This function prepares a bio for attaching integrity
+ * metadata.  nr_vecs specifies the maximum number of pages containing
+ * integrity metadata that can be attached.
+ *
+ * This is a wrapper around bio_integrity_alloc_bioset() that passes
+ * fs_bio_set as the bio_set; the result is handed back unchanged.
+ */
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+                                                  gfp_t gfp_mask,
+                                                  unsigned int nr_vecs)
+{
+        return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
+}
 EXPORT_SYMBOL(bio_integrity_alloc);
 /**
 * bio_integrity_free - Free bio integrity payload
 * @bio:        bio containing bip to be freed
+ * @bs:         bio_set this bio was allocated from
 *
 * Description: Used to free the integrity portion of a bio. Usually
 * called from bio_free().
 */
-void bio_integrity_free(struct bio *bio)
+void bio_integrity_free(struct bio *bio, struct bio_set *bs)
 {
        struct bio_integrity_payload *bip = bio->bi_integrity;
@@ -92,8 +150,10 @@ void bio_integrity_free(struct bio *bio)
            && bip->bip_buf != NULL)
                kfree(bip->bip_buf);
-        bvec_free_bs(integrity_bio_set, bip->bip_vec, bip->bip_pool);
+        if (use_bip_pool(bip->bip_slab))
-        mempool_free(bip, bio_integrity_pool);
+                mempool_free(bip, bs->bio_integrity_pool);
+        else
+                kmem_cache_free(bip_slab[bip->bip_slab].slab, bip);
        bio->bi_integrity = NULL;
 }
@@ -114,7 +174,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
        struct bio_integrity_payload *bip = bio->bi_integrity;
        struct bio_vec *iv;
-        if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) {
+        if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) {
                printk(KERN_ERR "%s: bip_vec full\n", __func__);
                return 0;
        }
@@ -647,8 +707,8 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
        bp->iv1 = bip->bip_vec[0];
        bp->iv2 = bip->bip_vec[0];
-        bp->bip1.bip_vec = &bp->iv1;
+        bp->bip1.bip_vec[0] = bp->iv1;
-        bp->bip2.bip_vec = &bp->iv2;
+        bp->bip2.bip_vec[0] = bp->iv2;
        bp->iv1.bv_len = sectors * bi->tuple_size;
        bp->iv2.bv_offset += sectors * bi->tuple_size;
@@ -667,17 +727,19 @@ EXPORT_SYMBOL(bio_integrity_split);
 * @bio:        New bio
 * @bio_src:    Original bio
 * @gfp_mask:   Memory allocation mask
+ * @bs:         bio_set to allocate bip from
 *
 * Description: Called to allocate a bip when cloning a bio
 */
-int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask)
+int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
+                        gfp_t gfp_mask, struct bio_set *bs)
 {
        struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
        struct bio_integrity_payload *bip;
        BUG_ON(bip_src == NULL);
-        bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
+        bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs);
        if (bip == NULL)
                return -EIO;
@@ -693,25 +755,43 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(bio_integrity_clone);
-static int __init bio_integrity_init(void)
+int bioset_integrity_create(struct bio_set *bs, int pool_size)
 {
-        kintegrityd_wq = create_workqueue("kintegrityd");
+        unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES); /* bip_slab[] index for BIO_MAX_PAGES vecs (per vecs_to_idx) */
+        bs->bio_integrity_pool = /* per-bio_set mempool, backed by that one slab */
+                mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab);
+        if (!bs->bio_integrity_pool)
+                return -1; /* bare -1, not an errno; bioset_create() only tests for non-zero */
+        return 0; /* pool stored in bs->bio_integrity_pool */
+}
+EXPORT_SYMBOL(bioset_integrity_create);
+void bioset_integrity_free(struct bio_set *bs)
+{
+        if (bs->bio_integrity_pool) /* skip when no integrity pool is set on this bio_set */
+                mempool_destroy(bs->bio_integrity_pool); /* NOTE(review): pointer is not reset to NULL afterwards */
+}
+EXPORT_SYMBOL(bioset_integrity_free);
+void __init bio_integrity_init(void)
+{
+        unsigned int i;
+        kintegrityd_wq = create_workqueue("kintegrityd");
         if (!kintegrityd_wq)
                 panic("Failed to create kintegrityd\n");
-        bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
+        for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) { /* one slab cache per bip_slab[] entry */
-                                        SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+                unsigned int size;
-        bio_integrity_pool = mempool_create_slab_pool(BIO_POOL_SIZE,
+                size = sizeof(struct bio_integrity_payload)
-                                                      bio_integrity_slab);
+                        + bip_slab[i].nr_vecs * sizeof(struct bio_vec); /* payload struct plus nr_vecs bio_vec entries */
-        if (!bio_integrity_pool)
-                panic("bio_integrity: can't allocate bip pool\n");
-        integrity_bio_set = bioset_create(BIO_POOL_SIZE, 0);
+                bip_slab[i].slab =
-        if (!integrity_bio_set)
+                        kmem_cache_create(bip_slab[i].name, size, 0,
-                panic("bio_integrity: can't allocate bio_set\n");
+                                          SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); /* result is not checked; SLAB_PANIC is relied on for failure */
+        }
-        return 0;
 }
-subsys_initcall(bio_integrity_init);
diff --git a/fs/bio.c b/fs/bio.c
index 59000215e59b..76738005c8e8 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -25,7 +25,6 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
-#include <linux/blktrace_api.h>
 #include <scsi/sg.h>            /* for struct sg_iovec */
 #include <trace/events/block.h>
@@ -239,7 +238,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
                bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
        if (bio_integrity(bio))
-                bio_integrity_free(bio);
+                bio_integrity_free(bio, bs);
        /*
         * If we have front padding, adjust the bio pointer before freeing
@@ -342,7 +341,7 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 static void bio_kmalloc_destructor(struct bio *bio)
 {
         if (bio_integrity(bio))
-                bio_integrity_free(bio);
+                bio_integrity_free(bio, fs_bio_set); /* NOTE(review): assumes the bip came from fs_bio_set, as bio_integrity_alloc() does -- confirm */
         kfree(bio); /* the bio itself is released with kfree() here */
 }
@@ -358,9 +357,9 @@ static void bio_kmalloc_destructor(struct bio *bio)
 *
 *   If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
 *   a bio. This is due to the mempool guarantees. To make this work, callers
- *   must never allocate more than 1 bio at the time from this pool. Callers
+ *   must never allocate more than 1 bio at a time from this pool. Callers
 *   that need to allocate more than 1 bio must always submit the previously
- *   allocate bio for IO before attempting to allocate a new one. Failure to
+ *   allocated bio for IO before attempting to allocate a new one. Failure to
 *   do so can cause livelocks under memory pressure.
 *
 **/
@@ -473,7 +472,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
        if (bio_integrity(bio)) {
                int ret;
-                ret = bio_integrity_clone(b, bio, gfp_mask);
+                ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set);
                if (ret < 0) {
                        bio_put(b);
@@ -706,14 +705,13 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
 }
 static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
-                          struct sg_iovec *iov, int iov_count, int uncopy,
+                          struct sg_iovec *iov, int iov_count,
-                          int do_free_page)
+                          int to_user, int from_user, int do_free_page)
 {
        int ret = 0, i;
        struct bio_vec *bvec;
        int iov_idx = 0;
        unsigned int iov_off = 0;
-        int read = bio_data_dir(bio) == READ;
        __bio_for_each_segment(bvec, bio, i, 0) {
                char *bv_addr = page_address(bvec->bv_page);
@@ -728,13 +726,14 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
                        iov_addr = iov[iov_idx].iov_base + iov_off;
                        if (!ret) {
-                                if (!read && !uncopy)
+                                if (to_user)
-                                        ret = copy_from_user(bv_addr, iov_addr,
-                                                             bytes);
-                                if (read && uncopy)
                                        ret = copy_to_user(iov_addr, bv_addr,
                                                           bytes);
+                                if (from_user)
+                                        ret = copy_from_user(bv_addr, iov_addr,
+                                                             bytes);
                                if (ret)
                                        ret = -EFAULT;
                        }
@@ -771,7 +770,8 @@ int bio_uncopy_user(struct bio *bio)
        if (!bio_flagged(bio, BIO_NULL_MAPPED))
                ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
-                                     bmd->nr_sgvecs, 1, bmd->is_our_pages);
+                                     bmd->nr_sgvecs, bio_data_dir(bio) == READ,
+                                     0, bmd->is_our_pages);
        bio_free_map_data(bmd);
        bio_put(bio);
        return ret;
@@ -876,8 +876,9 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
        /*
         * success
         */
-        if (!write_to_vm && (!map_data || !map_data->null_mapped)) {
+        if ((!write_to_vm && (!map_data || !map_data->null_mapped)) ||
-                ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
+            (map_data && map_data->from_user)) {
+                ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0);
                if (ret)
                        goto cleanup;
        }
@@ -1540,6 +1541,7 @@ void bioset_free(struct bio_set *bs)
        if (bs->bio_pool)
                mempool_destroy(bs->bio_pool);
+        bioset_integrity_free(bs);
        biovec_free_pools(bs);
        bio_put_slab(bs);
@@ -1580,6 +1582,9 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
        if (!bs->bio_pool)
                goto bad;
+        if (bioset_integrity_create(bs, pool_size))
+                goto bad;
        if (!biovec_create_pools(bs, pool_size))
                return bs;
@@ -1617,6 +1622,7 @@ static int __init init_bio(void)
        if (!bio_slabs)
                panic("bio: can't allocate bios\n");
+        bio_integrity_init();
        biovec_init_slabs();
        fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3a6d4fb2a329..94dfda24c06e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -564,6 +564,16 @@ struct block_device *bdget(dev_t dev)
 EXPORT_SYMBOL(bdget);
+/**
+ * bdgrab -- Grab a reference to an already referenced block device
+ * @bdev:       Block device to grab a reference to.
+ */
+struct block_device *bdgrab(struct block_device *bdev)
+{
+        atomic_inc(&bdev->bd_inode->i_count); /* the reference is counted on the bdev's inode */
+        return bdev; /* hand back the same bdev that was passed in */
+}
 long nr_blockdev_pages(void)
 {
        struct block_device *bdev;
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 603972576f0f..f128427b995b 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -29,51 +29,28 @@
 #ifdef CONFIG_FS_POSIX_ACL
-static void btrfs_update_cached_acl(struct inode *inode,
-                                    struct posix_acl **p_acl,
-                                    struct posix_acl *acl)
-{
-        spin_lock(&inode->i_lock);
-        if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED)
-                posix_acl_release(*p_acl);
-        *p_acl = posix_acl_dup(acl);
-        spin_unlock(&inode->i_lock);
-}
 static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 {
        int size;
        const char *name;
        char *value = NULL;
-        struct posix_acl *acl = NULL, **p_acl;
+        struct posix_acl *acl;
+        acl = get_cached_acl(inode, type);
+        if (acl != ACL_NOT_CACHED)
+                return acl;
        switch (type) {
        case ACL_TYPE_ACCESS:
                name = POSIX_ACL_XATTR_ACCESS;
-                p_acl = &BTRFS_I(inode)->i_acl;
                break;
        case ACL_TYPE_DEFAULT:
                name = POSIX_ACL_XATTR_DEFAULT;
-                p_acl = &BTRFS_I(inode)->i_default_acl;
                break;
        default:
-                return ERR_PTR(-EINVAL);
+                BUG();
        }
-        /* Handle the cached NULL acl case without locking */
-        acl = ACCESS_ONCE(*p_acl);
-        if (!acl)
-                return acl;
-        spin_lock(&inode->i_lock);
-        acl = *p_acl;
-        if (acl != BTRFS_ACL_NOT_CACHED)
-                acl = posix_acl_dup(acl);
-        spin_unlock(&inode->i_lock);
-        if (acl != BTRFS_ACL_NOT_CACHED)
-                return acl;
        size = __btrfs_getxattr(inode, name, "", 0);
        if (size > 0) {
                value = kzalloc(size, GFP_NOFS);
@@ -82,13 +59,13 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
                size = __btrfs_getxattr(inode, name, value, size);
                if (size > 0) {
                        acl = posix_acl_from_xattr(value, size);
-                        btrfs_update_cached_acl(inode, p_acl, acl);
+                        set_cached_acl(inode, type, acl);
                }
                kfree(value);
        } else if (size == -ENOENT || size == -ENODATA || size == 0) {
                /* FIXME, who returns -ENOENT?  I think nobody */
                acl = NULL;
-                btrfs_update_cached_acl(inode, p_acl, acl);
+                set_cached_acl(inode, type, acl);
        } else {
                acl = ERR_PTR(-EIO);
        }
@@ -121,7 +98,6 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
        int ret, size = 0;
        const char *name;
-        struct posix_acl **p_acl;
        char *value = NULL;
        mode_t mode;
@@ -141,13 +117,11 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
                ret = 0;
                inode->i_mode = mode;
                name = POSIX_ACL_XATTR_ACCESS;
-                p_acl = &BTRFS_I(inode)->i_acl;
                break;
        case ACL_TYPE_DEFAULT:
                if (!S_ISDIR(inode->i_mode))
                        return acl ? -EINVAL : 0;
                name = POSIX_ACL_XATTR_DEFAULT;
-                p_acl = &BTRFS_I(inode)->i_default_acl;
                break;
        default:
                return -EINVAL;
@@ -172,7 +146,7 @@ out:
        kfree(value);
        if (!ret)
-                btrfs_update_cached_acl(inode, p_acl, acl);
+                set_cached_acl(inode, type, acl);
        return ret;
 }
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7f88628a1a72..019e8af449ab 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -299,8 +299,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
                                           "btrfs-%s-%d", workers->name,
                                           workers->num_workers + i);
                if (IS_ERR(worker->task)) {
-                        kfree(worker);
                        ret = PTR_ERR(worker->task);
+                        kfree(worker);
                        goto fail;
                }
@@ -424,11 +424,11 @@ int btrfs_requeue_work(struct btrfs_work *work)
         * list
         */
        if (worker->idle) {
-                spin_lock_irqsave(&worker->workers->lock, flags);
+                spin_lock(&worker->workers->lock);
                worker->idle = 0;
                list_move_tail(&worker->worker_list,
                               &worker->workers->worker_list);
-                spin_unlock_irqrestore(&worker->workers->lock, flags);
+                spin_unlock(&worker->workers->lock);
        }
        if (!worker->working) {
                wake = 1;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index acb4f3517582..ea1ea0af8c0e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -53,10 +53,6 @@ struct btrfs_inode {
        /* used to order data wrt metadata */
        struct btrfs_ordered_inode_tree ordered_tree;
-        /* standard acl pointers */
-        struct posix_acl *i_acl;
-        struct posix_acl *i_default_acl;
        /* for keeping track of orphaned inodes */
        struct list_head i_orphan;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index de1e2fd32080..9d8ba4d54a37 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -26,7 +26,6 @@
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/backing-dev.h>
 #include <linux/mpage.h>
 #include <linux/swap.h>
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 60a45f3a4e91..3fdcc0512d3a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -557,19 +557,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
        btrfs_disk_key_to_cpu(&k1, disk);
-        if (k1.objectid > k2->objectid)
+        return btrfs_comp_cpu_keys(&k1, k2);
-                return 1;
-        if (k1.objectid < k2->objectid)
-                return -1;
-        if (k1.type > k2->type)
-                return 1;
-        if (k1.type < k2->type)
-                return -1;
-        if (k1.offset > k2->offset)
-                return 1;
-        if (k1.offset < k2->offset)
-                return -1;
-        return 0;
 }
 /*
@@ -1052,9 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
            BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
                return 0;
-        if (btrfs_header_nritems(mid) > 2)
-                return 0;
        if (btrfs_header_nritems(mid) < 2)
                err_on_enospc = 1;
@@ -1701,6 +1686,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
        struct extent_buffer *b;
        int slot;
        int ret;
+        int err;
        int level;
        int lowest_unlock = 1;
        u8 lowest_level = 0;
@@ -1737,8 +1723,6 @@ again:
                        p->locks[level] = 1;
                if (cow) {
-                        int wret;
                        /*
                         * if we don't really need to cow this block
                         * then we don't want to set the path blocking,
@@ -1749,12 +1733,12 @@ again:
                        btrfs_set_path_blocking(p);
-                        wret = btrfs_cow_block(trans, root, b,
+                        err = btrfs_cow_block(trans, root, b,
-                                               p->nodes[level + 1],
+                                              p->nodes[level + 1],
-                                               p->slots[level + 1], &b);
+                                              p->slots[level + 1], &b);
-                        if (wret) {
+                        if (err) {
                                free_extent_buffer(b);
-                                ret = wret;
+                                ret = err;
                                goto done;
                        }
                }
@@ -1793,41 +1777,45 @@ cow_done:
                ret = bin_search(b, key, level, &slot);
                if (level != 0) {
-                        if (ret && slot > 0)
+                        int dec = 0;
+                        if (ret && slot > 0) {
+                                dec = 1;
                                slot -= 1;
+                        }
                        p->slots[level] = slot;
-                        ret = setup_nodes_for_search(trans, root, p, b, level,
+                        err = setup_nodes_for_search(trans, root, p, b, level,
                                                     ins_len);
-                        if (ret == -EAGAIN)
+                        if (err == -EAGAIN)
                                goto again;
-                        else if (ret)
+                        if (err) {
+                                ret = err;
                                goto done;
+                        }
                        b = p->nodes[level];
                        slot = p->slots[level];
                        unlock_up(p, level, lowest_unlock);
-                        /* this is only true while dropping a snapshot */
                        if (level == lowest_level) {
-                                ret = 0;
+                                if (dec)
+                                        p->slots[level]++;
                                goto done;
                        }
-                        ret = read_block_for_search(trans, root, p,
+                        err = read_block_for_search(trans, root, p,
                                                    &b, level, slot, key);
-                        if (ret == -EAGAIN)
+                        if (err == -EAGAIN)
                                goto again;
+                        if (err) {
-                        if (ret == -EIO)
+                                ret = err;
                                goto done;
+                        }
                        if (!p->skip_locking) {
-                                int lret;
                                btrfs_clear_path_blocking(p, NULL);
-                                lret = btrfs_try_spin_lock(b);
+                                err = btrfs_try_spin_lock(b);
-                                if (!lret) {
+                                if (!err) {
                                        btrfs_set_path_blocking(p);
                                        btrfs_tree_lock(b);
                                        btrfs_clear_path_blocking(p, b);
@@ -1837,16 +1825,14 @@ cow_done:
                        p->slots[level] = slot;
                        if (ins_len > 0 &&
                            btrfs_leaf_free_space(root, b) < ins_len) {
-                                int sret;
                                btrfs_set_path_blocking(p);
-                                sret = split_leaf(trans, root, key,
+                                err = split_leaf(trans, root, key,
-                                                      p, ins_len, ret == 0);
+                                                 p, ins_len, ret == 0);
                                btrfs_clear_path_blocking(p, NULL);
-                                BUG_ON(sret > 0);
+                                BUG_ON(err > 0);
-                                if (sret) {
+                                if (err) {
-                                        ret = sret;
+                                        ret = err;
                                        goto done;
                                }
                        }
@@ -3807,7 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                }
                /* delete the leaf if it is mostly empty */
-                if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) {
+                if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
                        /* push_leaf_left fixes the path.
                         * make sure the path still points to our leaf
                         * for possible call to del_ptr below
@@ -4042,10 +4028,9 @@ out:
 * calling this function.
 */
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
-                        struct btrfs_key *key, int lowest_level,
+                        struct btrfs_key *key, int level,
                        int cache_only, u64 min_trans)
 {
-        int level = lowest_level;
        int slot;
        struct extent_buffer *c;
@@ -4058,11 +4043,40 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
                c = path->nodes[level];
 next:
                if (slot >= btrfs_header_nritems(c)) {
-                        level++;
+                        int ret;
-                        if (level == BTRFS_MAX_LEVEL)
+                        int orig_lowest;
+                        struct btrfs_key cur_key;
+                        if (level + 1 >= BTRFS_MAX_LEVEL ||
+                            !path->nodes[level + 1])
                                return 1;
-                        continue;
+                        if (path->locks[level + 1]) {
+                                level++;
+                                continue;
+                        }
+                        slot = btrfs_header_nritems(c) - 1;
+                        if (level == 0)
+                                btrfs_item_key_to_cpu(c, &cur_key, slot);
+                        else
+                                btrfs_node_key_to_cpu(c, &cur_key, slot);
+                        orig_lowest = path->lowest_level;
+                        btrfs_release_path(root, path);
+                        path->lowest_level = level;
+                        ret = btrfs_search_slot(NULL, root, &cur_key, path,
+                                                0, 0);
+                        path->lowest_level = orig_lowest;
+                        if (ret < 0)
+                                return ret;
+                        c = path->nodes[level];
+                        slot = path->slots[level];
+                        if (ret == 0)
+                                slot++;
+                        goto next;
                }
                if (level == 0)
                        btrfs_item_key_to_cpu(c, key, slot);
                else {
@@ -4146,7 +4160,8 @@ again:
         * advance the path if there are now more items available.
         */
        if (nritems > 0 && path->slots[0] < nritems - 1) {
-                path->slots[0]++;
+                if (ret == 0)
+                        path->slots[0]++;
                ret = 0;
                goto done;
        }
@@ -4278,10 +4293,10 @@ int btrfs_previous_item(struct btrfs_root *root,
                        path->slots[0]--;
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-                if (found_key.type == type)
-                        return 0;
                if (found_key.objectid < min_objectid)
                        break;
+                if (found_key.type == type)
+                        return 0;
                if (found_key.objectid == min_objectid &&
                    found_key.type < type)
                        break;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03441a99ea38..837435ce84ca 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -41,8 +41,6 @@ struct btrfs_ordered_sum;
 #define BTRFS_MAGIC "_BHRfS_M"
-#define BTRFS_ACL_NOT_CACHED    ((void *)-1)
 #define BTRFS_MAX_LEVEL 8
 #define BTRFS_COMPAT_EXTENT_TREE_V0
@@ -483,7 +481,7 @@ struct btrfs_shared_data_ref {
 struct btrfs_extent_inline_ref {
        u8 type;
-        u64 offset;
+        __le64 offset;
 } __attribute__ ((__packed__));
 /* old style backrefs item */
@@ -691,6 +689,7 @@ struct btrfs_space_info {
        struct list_head block_groups;
        spinlock_t lock;
        struct rw_semaphore groups_sem;
+        atomic_t caching_threads;
 };
 /*
@@ -709,6 +708,9 @@ struct btrfs_free_cluster {
        /* first extent starting offset */
        u64 window_start;
+        /* if this cluster simply points at a bitmap in the block group */
+        bool points_to_bitmap;
        struct btrfs_block_group_cache *block_group;
        /*
         * when a cluster is allocated from a block group, we put the
@@ -718,24 +720,37 @@ struct btrfs_free_cluster {
        struct list_head block_group_list;
 };
+enum btrfs_caching_type {
+        BTRFS_CACHE_NO          = 0,
+        BTRFS_CACHE_STARTED     = 1,
+        BTRFS_CACHE_FINISHED    = 2,
+};
 struct btrfs_block_group_cache {
        struct btrfs_key key;
        struct btrfs_block_group_item item;
+        struct btrfs_fs_info *fs_info;
        spinlock_t lock;
-        struct mutex cache_mutex;
        u64 pinned;
        u64 reserved;
        u64 flags;
-        int cached;
+        u64 sectorsize;
+        int extents_thresh;
+        int free_extents;
+        int total_bitmaps;
        int ro;
        int dirty;
+        /* cache tracking stuff */
+        wait_queue_head_t caching_q;
+        int cached;
        struct btrfs_space_info *space_info;
        /* free space cache stuff */
        spinlock_t tree_lock;
-        struct rb_root free_space_bytes;
        struct rb_root free_space_offset;
+        u64 free_space;
        /* block group cache stuff */
        struct rb_node cache_node;
@@ -810,6 +825,7 @@ struct btrfs_fs_info {
        struct mutex drop_mutex;
        struct mutex volume_mutex;
        struct mutex tree_reloc_mutex;
+        struct rw_semaphore extent_commit_sem;
        /*
         * this protects the ordered operations list only while we are
@@ -1990,6 +2006,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
                                 u64 bytes);
 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
                              u64 bytes);
+void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
@@ -2076,8 +2093,7 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref);
-                        *root);
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct extent_buffer *node,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0d50d49d990a..e83be2e4602c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -42,6 +42,8 @@
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
+static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
 /*
 * end_io_wq structs are used to do processing in task context when an IO is
 * complete.  This is used during reads to verify checksums, and it is used
@@ -1342,12 +1344,25 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
        free_extent_map(em);
 }
+/*
+ * If this fails, caller must call bdi_destroy() to get rid of the
+ * bdi again.
+ */
 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
-        bdi_init(bdi);
+        int err;
+        bdi->capabilities = BDI_CAP_MAP_COPY;
+        err = bdi_init(bdi);
+        if (err)
+                return err;
+        err = bdi_register(bdi, NULL, "btrfs-%d",
+                                atomic_inc_return(&btrfs_bdi_num));
+        if (err)
+                return err;
        bdi->ra_pages   = default_backing_dev_info.ra_pages;
-        bdi->state              = 0;
-        bdi->capabilities       = default_backing_dev_info.capabilities;
        bdi->unplug_io_fn       = btrfs_unplug_io_fn;
        bdi->unplug_io_data     = info;
        bdi->congested_fn       = btrfs_congested_fn;
@@ -1569,7 +1584,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        fs_info->sb = sb;
        fs_info->max_extent = (u64)-1;
        fs_info->max_inline = 8192 * 1024;
-        setup_bdi(fs_info, &fs_info->bdi);
+        if (setup_bdi(fs_info, &fs_info->bdi))
+                goto fail_bdi;
        fs_info->btree_inode = new_inode(sb);
        fs_info->btree_inode->i_ino = 1;
        fs_info->btree_inode->i_nlink = 1;
@@ -1623,6 +1639,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        mutex_init(&fs_info->cleaner_mutex);
        mutex_init(&fs_info->volume_mutex);
        mutex_init(&fs_info->tree_reloc_mutex);
+        init_rwsem(&fs_info->extent_commit_sem);
        btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
        btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -1783,6 +1800,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                                           btrfs_super_chunk_root(disk_super),
                                           blocksize, generation);
        BUG_ON(!chunk_root->node);
+        if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+                printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
+                       sb->s_id);
+                goto fail_chunk_root;
+        }
        btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
        chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1810,6 +1832,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                                          blocksize, generation);
        if (!tree_root->node)
                goto fail_chunk_root;
+        if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+                printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
+                       sb->s_id);
+                goto fail_tree_root;
+        }
        btrfs_set_root_node(&tree_root->root_item, tree_root->node);
        tree_root->commit_root = btrfs_root_node(tree_root);
@@ -1946,8 +1973,8 @@ fail_iput:
        btrfs_close_devices(fs_info->fs_devices);
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
+fail_bdi:
        bdi_destroy(&fs_info->bdi);
 fail:
        kfree(extent_root);
        kfree(tree_root);
@@ -2306,6 +2333,9 @@ int close_ctree(struct btrfs_root *root)
                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
        }
+        fs_info->closing = 2;
+        smp_mb();
        if (fs_info->delalloc_bytes) {
                printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
                       (unsigned long long)fs_info->delalloc_bytes);
@@ -2327,6 +2357,7 @@ int close_ctree(struct btrfs_root *root)
        free_extent_buffer(root->fs_info->csum_root->commit_root);
        btrfs_free_block_groups(root->fs_info);
+        btrfs_free_pinned_extents(root->fs_info);
        del_fs_roots(fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index edc7d208c5ce..72a2b9c28e9f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -21,6 +21,7 @@
 #include <linux/blkdev.h>
 #include <linux/sort.h>
 #include <linux/rcupdate.h>
+#include <linux/kthread.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
@@ -61,6 +62,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
                          u64 flags, int force);
+/*
+ * Tell whether free-space caching of @cache has completed.  A full memory
+ * barrier is issued before ->cached is sampled.
+ *
+ * Returns non-zero iff ->cached equals BTRFS_CACHE_FINISHED.
+ */
+static noinline int
+block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+        int finished;
+        smp_mb();
+        finished = (cache->cached == BTRFS_CACHE_FINISHED);
+        return finished;
+}
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
        return (cache->flags & bits) == bits;
@@ -146,20 +154,70 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 }
 /*
+ * We always set EXTENT_LOCKED for the super mirror extents so we don't
+ * overwrite them, so those bits need to be unset.  Also, if we are unmounting
+ * with pinned extents still sitting there because we had a block group caching,
+ * we need to clear those now, since we are done.
+ */
+void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
+{
+        struct extent_io_tree *pinned = &info->pinned_extents;
+        u64 range_start, range_end;
+        u64 cursor = 0;
+        /* walk forward until no LOCKED/DIRTY range is left in the tree */
+        for (;;) {
+                if (find_first_extent_bit(pinned, cursor,
+                                          &range_start, &range_end,
+                                          EXTENT_LOCKED|EXTENT_DIRTY))
+                        break;
+                clear_extent_bits(pinned, range_start, range_end,
+                                  EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
+                /* resume the search just past the range we cleared */
+                cursor = range_end + 1;
+        }
+}
+/*
+ * For every super block mirror offset, map it back to the logical ranges it
+ * occupies in @cache and lock those ranges in fs_info->pinned_extents.
+ *
+ * Always returns 0; a failing btrfs_rmap_block() trips BUG_ON().
+ */
+static int remove_sb_from_cache(struct btrfs_root *root,
+                                struct btrfs_block_group_cache *cache)
+{
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        int mirror;
+        for (mirror = 0; mirror < BTRFS_SUPER_MIRROR_MAX; mirror++) {
+                u64 sb_bytenr = btrfs_sb_offset(mirror);
+                u64 *logical;
+                int stripe_len;
+                int nr;
+                int ret;
+                ret = btrfs_rmap_block(&fs_info->mapping_tree,
+                                       cache->key.objectid, sb_bytenr,
+                                       0, &logical, &nr, &stripe_len);
+                BUG_ON(ret);
+                /* lock each returned stripe, last entry first */
+                while (nr--) {
+                        u64 stripe_start = logical[nr];
+                        try_lock_extent(&fs_info->pinned_extents,
+                                        stripe_start,
+                                        stripe_start + stripe_len - 1,
+                                        GFP_NOFS);
+                }
+                /* the array handed back by btrfs_rmap_block() is ours to free */
+                kfree(logical);
+        }
+        return 0;
+}
+/*
 * this is only called by cache_block_group, since we could have freed extents
 * we need to check the pinned_extents for any extents that can't be used yet
 * since their free space will be released as soon as the transaction commits.
 */
-static int add_new_free_space(struct btrfs_block_group_cache *block_group,
+static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
                              struct btrfs_fs_info *info, u64 start, u64 end)
 {
-        u64 extent_start, extent_end, size;
+        u64 extent_start, extent_end, size, total_added = 0;
        int ret;
        while (start < end) {
                ret = find_first_extent_bit(&info->pinned_extents, start,
                                            &extent_start, &extent_end,
-                                            EXTENT_DIRTY);
+                                            EXTENT_DIRTY|EXTENT_LOCKED);
                if (ret)
                        break;
@@ -167,6 +225,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
                        start = extent_end + 1;
                } else if (extent_start > start && extent_start < end) {
                        size = extent_start - start;
+                        total_added += size;
                        ret = btrfs_add_free_space(block_group, start,
                                                   size);
                        BUG_ON(ret);
@@ -178,84 +237,93 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
        if (start < end) {
                size = end - start;
+                total_added += size;
                ret = btrfs_add_free_space(block_group, start, size);
                BUG_ON(ret);
        }
-        return 0;
+        return total_added;
 }
-static int remove_sb_from_cache(struct btrfs_root *root,
+static int caching_kthread(void *data)
-                                struct btrfs_block_group_cache *cache)
-{
-        u64 bytenr;
-        u64 *logical;
-        int stripe_len;
-        int i, nr, ret;
-        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
-                bytenr = btrfs_sb_offset(i);
-                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
-                                       cache->key.objectid, bytenr, 0,
-                                       &logical, &nr, &stripe_len);
-                BUG_ON(ret);
-                while (nr--) {
-                        btrfs_remove_free_space(cache, logical[nr],
-                                                stripe_len);
-                }
-                kfree(logical);
-        }
-        return 0;
-}
-static int cache_block_group(struct btrfs_root *root,
-                             struct btrfs_block_group_cache *block_group)
 {
+        struct btrfs_block_group_cache *block_group = data;
+        struct btrfs_fs_info *fs_info = block_group->fs_info;
+        u64 last = 0;
        struct btrfs_path *path;
        int ret = 0;
        struct btrfs_key key;
        struct extent_buffer *leaf;
        int slot;
-        u64 last;
+        u64 total_found = 0;
-        if (!block_group)
+        BUG_ON(!fs_info);
-                return 0;
-        root = root->fs_info->extent_root;
-        if (block_group->cached)
-                return 0;
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
-        path->reada = 2;
+        atomic_inc(&block_group->space_info->caching_threads);
+        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
        /*
-         * we get into deadlocks with paths held by callers of this function.
+         * We don't want to deadlock with somebody trying to allocate a new
-         * since the alloc_mutex is protecting things right now, just
+         * extent for the extent root while also trying to search the extent
-         * skip the locking here
+         * root to add free space.  So we skip locking and search the commit
+         * root, since it's read-only
         */
        path->skip_locking = 1;
-        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+        path->search_commit_root = 1;
+        path->reada = 2;
        key.objectid = last;
        key.offset = 0;
        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+again:
+        /* need to make sure the commit_root doesn't disappear */
+        down_read(&fs_info->extent_commit_sem);
+        ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto err;
        while (1) {
+                smp_mb();
+                if (block_group->fs_info->closing > 1) {
+                        last = (u64)-1;
+                        break;
+                }
                leaf = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(leaf)) {
-                        ret = btrfs_next_leaf(root, path);
+                        ret = btrfs_next_leaf(fs_info->extent_root, path);
                        if (ret < 0)
                                goto err;
-                        if (ret == 0)
+                        else if (ret)
-                                continue;
-                        else
                                break;
+                        if (need_resched() ||
+                            btrfs_transaction_in_commit(fs_info)) {
+                                leaf = path->nodes[0];
+                                /* this shouldn't happen, but if the
+                                 * leaf is empty just move on.
+                                 */
+                                if (btrfs_header_nritems(leaf) == 0)
+                                        break;
+                                /*
+                                 * we need to copy the key out so that
+                                 * we are sure the next search advances
+                                 * us forward in the btree.
+                                 */
+                                btrfs_item_key_to_cpu(leaf, &key, 0);
+                                btrfs_release_path(fs_info->extent_root, path);
+                                up_read(&fs_info->extent_commit_sem);
+                                schedule_timeout(1);
+                                goto again;
+                        }
+                        continue;
                }
                btrfs_item_key_to_cpu(leaf, &key, slot);
                if (key.objectid < block_group->key.objectid)
@@ -266,24 +334,59 @@ static int cache_block_group(struct btrfs_root *root,
                        break;
                if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
-                        add_new_free_space(block_group, root->fs_info, last,
+                        total_found += add_new_free_space(block_group,
-                                           key.objectid);
+                                                          fs_info, last,
+                                                          key.objectid);
                        last = key.objectid + key.offset;
                }
+                if (total_found > (1024 * 1024 * 2)) {
+                        total_found = 0;
+                        wake_up(&block_group->caching_q);
+                }
 next:
                path->slots[0]++;
        }
+        ret = 0;
-        add_new_free_space(block_group, root->fs_info, last,
+        total_found += add_new_free_space(block_group, fs_info, last,
-                           block_group->key.objectid +
+                                          block_group->key.objectid +
-                           block_group->key.offset);
+                                          block_group->key.offset);
+        spin_lock(&block_group->lock);
+        block_group->cached = BTRFS_CACHE_FINISHED;
+        spin_unlock(&block_group->lock);
-        block_group->cached = 1;
-        remove_sb_from_cache(root, block_group);
-        ret = 0;
 err:
        btrfs_free_path(path);
+        up_read(&fs_info->extent_commit_sem);
+        atomic_dec(&block_group->space_info->caching_threads);
+        wake_up(&block_group->caching_q);
+        return 0;
+}
+/*
+ * Kick off asynchronous free-space caching for @cache.
+ *
+ * If the group is still in state BTRFS_CACHE_NO it is moved to
+ * BTRFS_CACHE_STARTED under cache->lock and a caching_kthread is spawned
+ * for it.  Any other state means caching was already requested, so nothing
+ * is done.
+ *
+ * Returns 0.  Failing to create the thread is treated as fatal (BUG()).
+ */
+static int cache_block_group(struct btrfs_block_group_cache *cache)
+{
+        struct task_struct *tsk;
+        int ret = 0;
+        /* only the caller that sees CACHE_NO gets to start the thread */
+        spin_lock(&cache->lock);
+        if (cache->cached != BTRFS_CACHE_NO) {
+                spin_unlock(&cache->lock);
+                return ret;
+        }
+        cache->cached = BTRFS_CACHE_STARTED;
+        spin_unlock(&cache->lock);
+        /*
+         * The format string becomes the task name, so it must not end in a
+         * newline.  The objectid is cast for %llu like the other printk
+         * callers in this file.
+         */
+        tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu",
+                          (unsigned long long)cache->key.objectid);
+        if (IS_ERR(tsk)) {
+                ret = PTR_ERR(tsk);
+                printk(KERN_ERR "error running thread %d\n", ret);
+                BUG();
+        }
        return ret;
 }
@@ -990,15 +1093,13 @@ static inline int extent_ref_type(u64 parent, u64 owner)
        return type;
 }
-static int find_next_key(struct btrfs_path *path, struct btrfs_key *key)
+static int find_next_key(struct btrfs_path *path, int level,
+                         struct btrfs_key *key)
 {
-        int level;
+        for (; level < BTRFS_MAX_LEVEL; level++) {
-        BUG_ON(!path->keep_locks);
-        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
                if (!path->nodes[level])
                        break;
-                btrfs_assert_tree_locked(path->nodes[level]);
                if (path->slots[level] + 1 >=
                    btrfs_header_nritems(path->nodes[level]))
                        continue;
@@ -1158,7 +1259,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
                 * For simplicity, we just do not add new inline back
                 * ref if there is any kind of item for this block
                 */
-                if (find_next_key(path, &key) == 0 && key.objectid == bytenr &&
+                if (find_next_key(path, 0, &key) == 0 &&
+                    key.objectid == bytenr &&
                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
                        err = -EAGAIN;
                        goto out;
@@ -2388,13 +2490,29 @@ fail:
 }
+/*
+ * Step from @cache to the block group that follows it in the rbtree.
+ *
+ * The reference held on @cache is dropped.  The returned group, if any, has
+ * had its ->count incremented.  Returns NULL when @cache was the last node.
+ */
+static struct btrfs_block_group_cache *
+next_block_group(struct btrfs_root *root,
+                 struct btrfs_block_group_cache *cache)
+{
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        struct btrfs_block_group_cache *next = NULL;
+        struct rb_node *node;
+        spin_lock(&fs_info->block_group_cache_lock);
+        /* look up the successor before giving up our reference on @cache */
+        node = rb_next(&cache->cache_node);
+        btrfs_put_block_group(cache);
+        if (node) {
+                next = rb_entry(node, struct btrfs_block_group_cache,
+                                cache_node);
+                atomic_inc(&next->count);
+        }
+        spin_unlock(&fs_info->block_group_cache_lock);
+        return next;
+}
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
 {
-        struct btrfs_block_group_cache *cache, *entry;
+        struct btrfs_block_group_cache *cache;
-        struct rb_node *n;
        int err = 0;
-        int werr = 0;
        struct btrfs_path *path;
        u64 last = 0;
@@ -2403,39 +2521,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        while (1) {
-                cache = NULL;
+                if (last == 0) {
-                spin_lock(&root->fs_info->block_group_cache_lock);
+                        err = btrfs_run_delayed_refs(trans, root,
-                for (n = rb_first(&root->fs_info->block_group_cache_tree);
+                                                     (unsigned long)-1);
-                     n; n = rb_next(n)) {
+                        BUG_ON(err);
-                        entry = rb_entry(n, struct btrfs_block_group_cache,
-                                         cache_node);
-                        if (entry->dirty) {
-                                cache = entry;
-                                break;
-                        }
                }
-                spin_unlock(&root->fs_info->block_group_cache_lock);
-                if (!cache)
+                cache = btrfs_lookup_first_block_group(root->fs_info, last);
-                        break;
+                while (cache) {
+                        if (cache->dirty)
+                                break;
+                        cache = next_block_group(root, cache);
+                }
+                if (!cache) {
+                        if (last == 0)
+                                break;
+                        last = 0;
+                        continue;
+                }
                cache->dirty = 0;
-                last += cache->key.offset;
+                last = cache->key.objectid + cache->key.offset;
-                err = write_one_cache_group(trans, root,
+                err = write_one_cache_group(trans, root, path, cache);
-                                            path, cache);
+                BUG_ON(err);
-                /*
+                btrfs_put_block_group(cache);
-                 * if we fail to write the cache group, we want
-                 * to keep it marked dirty in hopes that a later
-                 * write will work
-                 */
-                if (err) {
-                        werr = err;
-                        continue;
-                }
        }
        btrfs_free_path(path);
-        return werr;
+        return 0;
 }
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -2485,6 +2599,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        found->force_alloc = 0;
        *space_info = found;
        list_add_rcu(&found->list, &info->space_info);
+        atomic_set(&found->caching_threads, 0);
        return 0;
 }
@@ -2697,7 +2812,7 @@ again:
                printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
                       ", %llu bytes_used, %llu bytes_reserved, "
-                       "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
+                       "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
                       "%llu total\n", (unsigned long long)bytes,
                       (unsigned long long)data_sinfo->bytes_delalloc,
                       (unsigned long long)data_sinfo->bytes_used,
@@ -2948,13 +3063,9 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
        struct btrfs_block_group_cache *cache;
        struct btrfs_fs_info *fs_info = root->fs_info;
-        if (pin) {
+        if (pin)
                set_extent_dirty(&fs_info->pinned_extents,
                                bytenr, bytenr + num - 1, GFP_NOFS);
-        } else {
-                clear_extent_dirty(&fs_info->pinned_extents,
-                                bytenr, bytenr + num - 1, GFP_NOFS);
-        }
        while (num > 0) {
                cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2970,14 +3081,34 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
                        spin_unlock(&cache->space_info->lock);
                        fs_info->total_pinned += len;
                } else {
+                        int unpin = 0;
+                        /*
+                         * in order to not race with the block group caching, we
+                         * only want to unpin the extent if we are cached.  If
+                         * we aren't cached, we want to start async caching this
+                         * block group so we can free the extent the next time
+                         * around.
+                         */
                        spin_lock(&cache->space_info->lock);
                        spin_lock(&cache->lock);
-                        cache->pinned -= len;
+                        unpin = (cache->cached == BTRFS_CACHE_FINISHED);
-                        cache->space_info->bytes_pinned -= len;
+                        if (likely(unpin)) {
+                                cache->pinned -= len;
+                                cache->space_info->bytes_pinned -= len;
+                                fs_info->total_pinned -= len;
+                        }
                        spin_unlock(&cache->lock);
                        spin_unlock(&cache->space_info->lock);
-                        fs_info->total_pinned -= len;
-                        if (cache->cached)
+                        if (likely(unpin))
+                                clear_extent_dirty(&fs_info->pinned_extents,
+                                                   bytenr, bytenr + len -1,
+                                                   GFP_NOFS);
+                        else
+                                cache_block_group(cache);
+                        if (unpin)
                                btrfs_add_free_space(cache, bytenr, len);
                }
                btrfs_put_block_group(cache);
@@ -3031,6 +3162,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
                                            &start, &end, EXTENT_DIRTY);
                if (ret)
                        break;
                set_extent_dirty(copy, start, end, GFP_NOFS);
                last = end + 1;
        }
@@ -3059,6 +3191,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                cond_resched();
        }
        return ret;
 }
@@ -3437,6 +3570,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
 }
 /*
+ * when we wait for progress in the block group caching, it's because
+ * our allocation attempt failed at least once.  So, we must sleep
+ * and let some progress happen before we try again.
+ *
+ * This function will sleep at least once waiting for new free space to
+ * show up, and then it will check the block group free space numbers
+ * for our min num_bytes.  Another option is to have it go ahead
+ * and look in the rbtree for a free extent of a given size, but this
+ * is a good start.
+ */
+static noinline int
+wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+                                u64 num_bytes)
+{
+        DEFINE_WAIT(wait);
+        /* queue ourselves first so the check below cannot miss a wakeup */
+        prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
+        if (!block_group_cache_done(cache)) {
+                /* sleep once unconditionally, then wait for enough space */
+                schedule();
+                finish_wait(&cache->caching_q, &wait);
+                wait_event(cache->caching_q,
+                           block_group_cache_done(cache) ||
+                           (cache->free_space >= num_bytes));
+        } else {
+                /* caching already finished, nothing to wait for */
+                finish_wait(&cache->caching_q, &wait);
+        }
+        return 0;
+}
+enum btrfs_loop_type {
+        LOOP_CACHED_ONLY = 0,           /* only search fully cached block groups */
+        LOOP_CACHING_NOWAIT = 1,        /* also search partially cached groups, never wait */
+        LOOP_CACHING_WAIT = 2,          /* search everything, wait if the group is caching */
+        LOOP_ALLOC_CHUNK = 3,           /* force a chunk allocation and try again */
+        LOOP_NO_EMPTY_SIZE = 4,         /* retry with empty_size and empty_cluster set to 0 */
+};
+/*
 * walks the btree of allocated extents and find a hole of a given size.
 * The key ins is changed to record the hole:
 * ins->objectid == block start
@@ -3461,6 +3633,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        struct btrfs_space_info *space_info;
        int last_ptr_loop = 0;
        int loop = 0;
+        bool found_uncached_bg = false;
        WARN_ON(num_bytes < root->sectorsize);
        btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3492,15 +3665,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        search_start = max(search_start, first_logical_byte(root, 0));
        search_start = max(search_start, hint_byte);
-        if (!last_ptr) {
+        if (!last_ptr)
                empty_cluster = 0;
-                loop = 1;
-        }
        if (search_start == hint_byte) {
                block_group = btrfs_lookup_block_group(root->fs_info,
                                                       search_start);
-                if (block_group && block_group_bits(block_group, data)) {
+                /*
+                 * we don't want to use the block group if it doesn't match our
+                 * allocation bits, or if it's not cached.
+                 */
+                if (block_group && block_group_bits(block_group, data) &&
+                    block_group_cache_done(block_group)) {
                        down_read(&space_info->groups_sem);
                        if (list_empty(&block_group->list) ||
                            block_group->ro) {
@@ -3523,21 +3699,35 @@ search:
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group, &space_info->block_groups, list) {
                u64 offset;
+                int cached;
                atomic_inc(&block_group->count);
                search_start = block_group->key.objectid;
 have_block_group:
-                if (unlikely(!block_group->cached)) {
+                if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
-                        mutex_lock(&block_group->cache_mutex);
+                        /*
-                        ret = cache_block_group(root, block_group);
+                         * we want to start caching kthreads, but not too many
-                        mutex_unlock(&block_group->cache_mutex);
+                         * right off the bat so we don't overwhelm the system,
-                        if (ret) {
+                         * so only start them if there are less than 2 and we're
-                                btrfs_put_block_group(block_group);
+                         * in the initial allocation phase.
-                                break;
+                         */
+                        if (loop > LOOP_CACHING_NOWAIT ||
+                            atomic_read(&space_info->caching_threads) < 2) {
+                                ret = cache_block_group(block_group);
+                                BUG_ON(ret);
                        }
                }
+                cached = block_group_cache_done(block_group);
+                if (unlikely(!cached)) {
+                        found_uncached_bg = true;
+                        /* if we only want cached bgs, loop */
+                        if (loop == LOOP_CACHED_ONLY)
+                                goto loop;
+                }
                if (unlikely(block_group->ro))
                        goto loop;
@@ -3616,14 +3806,21 @@ refill_cluster:
                                        spin_unlock(&last_ptr->refill_lock);
                                        goto checks;
                                }
+                        } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
+                                spin_unlock(&last_ptr->refill_lock);
+                                wait_block_group_cache_progress(block_group,
+                                       num_bytes + empty_cluster + empty_size);
+                                goto have_block_group;
                        }
                        /*
                         * at this point we either didn't find a cluster
                         * or we weren't able to allocate a block from our
                         * cluster.  Free the cluster we've been trying
                         * to use, and go to the next block group
                         */
-                        if (loop < 2) {
+                        if (loop < LOOP_NO_EMPTY_SIZE) {
                                btrfs_return_cluster_to_free_space(NULL,
                                                                   last_ptr);
                                spin_unlock(&last_ptr->refill_lock);
@@ -3634,11 +3831,17 @@ refill_cluster:
                offset = btrfs_find_space_for_alloc(block_group, search_start,
                                                    num_bytes, empty_size);
-                if (!offset)
+                if (!offset && (cached || (!cached &&
+                                           loop == LOOP_CACHING_NOWAIT))) {
                        goto loop;
+                } else if (!offset && (!cached &&
+                                       loop > LOOP_CACHING_NOWAIT)) {
+                        wait_block_group_cache_progress(block_group,
+                                        num_bytes + empty_size);
+                        goto have_block_group;
+                }
 checks:
                search_start = stripe_align(root, offset);
                /* move on to the next group */
                if (search_start + num_bytes >= search_end) {
                        btrfs_add_free_space(block_group, offset, num_bytes);
@@ -3684,13 +3887,26 @@ loop:
        }
        up_read(&space_info->groups_sem);
-        /* loop == 0, try to find a clustered alloc in every block group
+        /* LOOP_CACHED_ONLY, only search fully cached block groups
-         * loop == 1, try again after forcing a chunk allocation
+         * LOOP_CACHING_NOWAIT, search partially cached block groups, but
-         * loop == 2, set empty_size and empty_cluster to 0 and try again
+         *                      don't wait for them to finish caching
+         * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+         * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
+         * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
+         *                      again
         */
-        if (!ins->objectid && loop < 3 &&
+        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
-            (empty_size || empty_cluster || allowed_chunk_alloc)) {
+            (found_uncached_bg || empty_size || empty_cluster ||
-                if (loop >= 2) {
+             allowed_chunk_alloc)) {
+                if (found_uncached_bg) {
+                        found_uncached_bg = false;
+                        if (loop < LOOP_CACHING_WAIT) {
+                                loop++;
+                                goto search;
+                        }
+                }
+                if (loop == LOOP_ALLOC_CHUNK) {
                        empty_size = 0;
                        empty_cluster = 0;
                }
@@ -3703,7 +3919,7 @@ loop:
                        space_info->force_alloc = 1;
                }
-                if (loop < 3) {
+                if (loop < LOOP_NO_EMPTY_SIZE) {
                        loop++;
                        goto search;
                }
@@ -3799,7 +4015,7 @@ again:
                               num_bytes, data, 1);
                goto again;
        }
-        if (ret) {
+        if (ret == -ENOSPC) {
                struct btrfs_space_info *sinfo;
                sinfo = __find_space_info(root->fs_info, data);
@@ -3807,7 +4023,6 @@ again:
                       "wanted %llu\n", (unsigned long long)data,
                       (unsigned long long)num_bytes);
                dump_space_info(sinfo, num_bytes);
-                BUG();
        }
        return ret;
@@ -3845,7 +4060,9 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
        ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
                                     empty_size, hint_byte, search_end, ins,
                                     data);
-        update_reserved_extents(root, ins->objectid, ins->offset, 1);
+        if (!ret)
+                update_reserved_extents(root, ins->objectid, ins->offset, 1);
        return ret;
 }
@@ -4007,9 +4224,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
        struct btrfs_block_group_cache *block_group;
        block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-        mutex_lock(&block_group->cache_mutex);
+        cache_block_group(block_group);
-        cache_block_group(root, block_group);
+        wait_event(block_group->caching_q,
-        mutex_unlock(&block_group->cache_mutex);
+                   block_group_cache_done(block_group));
        ret = btrfs_remove_free_space(block_group, ins->objectid,
                                      ins->offset);
@@ -4040,7 +4257,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
        ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
                                     empty_size, hint_byte, search_end,
                                     ins, 0);
-        BUG_ON(ret);
+        if (ret)
+                return ret;
        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
                if (parent == 0)
@@ -4128,6 +4346,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
        return buf;
 }
+#if 0
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root, struct extent_buffer *leaf)
 {
@@ -4171,8 +4390,6 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
        return 0;
 }
-#if 0
 static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root,
                                        struct btrfs_leaf_ref *ref)
@@ -4553,262 +4770,471 @@ out:
 }
 #endif
+struct walk_control { /* state shared by walk_down_proc/walk_up_proc while walking a tree */
+        u64 refs[BTRFS_MAX_LEVEL]; /* per-level ref count, filled by btrfs_lookup_extent_info */
+        u64 flags[BTRFS_MAX_LEVEL]; /* per-level extent flags, filled by the same lookup */
+        struct btrfs_key update_progress; /* key compared against a shared block's first key */
+        int stage; /* DROP_REFERENCE or UPDATE_BACKREF */
+        int level; /* level in the path currently being processed */
+        int shared_level; /* level where stage became UPDATE_BACKREF, -1 otherwise */
+        int update_ref; /* non-zero: may switch to UPDATE_BACKREF on shared blocks */
+        int keep_locks; /* non-zero: walk_down_proc keeps blocks locked in DROP_REFERENCE */
+};
+#define DROP_REFERENCE  1 /* wc->stage: dropping references on tree blocks */
+#define UPDATE_BACKREF  2 /* wc->stage: updating back refs below a shared block */
 /*
- * helper function for drop_subtree, this function is similar to
+ * helper to process tree block while walking down the tree.
- * walk_down_tree. The main difference is that it checks reference
+ *
- * counts while tree blocks are locked.
+ * when wc->stage == DROP_REFERENCE, this function checks the
+ * reference count of the block. if the block is shared and
+ * we need to update back refs for the subtree rooted at the
+ * block, this function changes wc->stage to UPDATE_BACKREF.
+ *
+ * when wc->stage == UPDATE_BACKREF, this function updates
+ * back refs for pointers in the block.
+ *
+ * NOTE: return value 1 means we should stop walking down.
 */
-static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
+static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
-                                   struct btrfs_path *path, int *level)
+                                   struct btrfs_path *path,
+                                   struct walk_control *wc)
 {
-        struct extent_buffer *next;
+        int level = wc->level;
-        struct extent_buffer *cur;
+        struct extent_buffer *eb = path->nodes[level];
-        struct extent_buffer *parent;
+        struct btrfs_key key;
-        u64 bytenr;
+        u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
-        u64 ptr_gen;
-        u64 refs;
-        u64 flags;
-        u32 blocksize;
        int ret;
-        cur = path->nodes[*level];
+        if (wc->stage == UPDATE_BACKREF &&
-        ret = btrfs_lookup_extent_info(trans, root, cur->start, cur->len,
+            btrfs_header_owner(eb) != root->root_key.objectid)
-                                       &refs, &flags);
+                return 1;
-        BUG_ON(ret);
-        if (refs > 1)
-                goto out;
-        BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+        /*
+         * when reference count of tree block is 1, it won't increase
+         * again. once full backref flag is set, we never clear it.
+         */
+        if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
+            (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
+                BUG_ON(!path->locks[level]);
+                ret = btrfs_lookup_extent_info(trans, root,
+                                               eb->start, eb->len,
+                                               &wc->refs[level],
+                                               &wc->flags[level]);
+                BUG_ON(ret);
+                BUG_ON(wc->refs[level] == 0);
+        }
-        while (*level >= 0) {
+        if (wc->stage == DROP_REFERENCE &&
-                cur = path->nodes[*level];
+            wc->update_ref && wc->refs[level] > 1) {
-                if (*level == 0) {
+                BUG_ON(eb == root->node);
-                        ret = btrfs_drop_leaf_ref(trans, root, cur);
+                BUG_ON(path->slots[level] > 0);
-                        BUG_ON(ret);
+                if (level == 0)
-                        clean_tree_block(trans, root, cur);
+                        btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
-                        break;
+                else
-                }
+                        btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
-                if (path->slots[*level] >= btrfs_header_nritems(cur)) {
+                if (btrfs_header_owner(eb) == root->root_key.objectid &&
-                        clean_tree_block(trans, root, cur);
+                    btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
-                        break;
+                        wc->stage = UPDATE_BACKREF;
+                        wc->shared_level = level;
                }
+        }
-                bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+        if (wc->stage == DROP_REFERENCE) {
-                blocksize = btrfs_level_size(root, *level - 1);
+                if (wc->refs[level] > 1)
-                ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+                        return 1;
-                next = read_tree_block(root, bytenr, blocksize, ptr_gen);
+                if (path->locks[level] && !wc->keep_locks) {
-                btrfs_tree_lock(next);
+                        btrfs_tree_unlock(eb);
-                btrfs_set_lock_blocking(next);
+                        path->locks[level] = 0;
+                }
+                return 0;
+        }
-                ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+        /* wc->stage == UPDATE_BACKREF */
-                                               &refs, &flags);
+        if (!(wc->flags[level] & flag)) {
+                BUG_ON(!path->locks[level]);
+                ret = btrfs_inc_ref(trans, root, eb, 1);
                BUG_ON(ret);
-                if (refs > 1) {
+                ret = btrfs_dec_ref(trans, root, eb, 0);
-                        parent = path->nodes[*level];
+                BUG_ON(ret);
-                        ret = btrfs_free_extent(trans, root, bytenr,
+                ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
-                                                blocksize, parent->start,
+                                                  eb->len, flag, 0);
-                                                btrfs_header_owner(parent),
+                BUG_ON(ret);
-                                                *level - 1, 0);
+                wc->flags[level] |= flag;
+        }
+        /*
+         * the block is shared by multiple trees, so it's not good to
+         * keep the tree lock
+         */
+        if (path->locks[level] && level > 0) {
+                btrfs_tree_unlock(eb);
+                path->locks[level] = 0;
+        }
+        return 0;
+}
+/*
+ * helper to process tree block while walking up the tree.
+ *
+ * when wc->stage == DROP_REFERENCE, this function drops
+ * reference count on the block.
+ *
+ * when wc->stage == UPDATE_BACKREF, this function changes
+ * wc->stage back to DROP_REFERENCE if we changed wc->stage
+ * to UPDATE_BACKREF previously while processing the block.
+ *
+ * NOTE: return value 1 means we should stop walking up.
+ */
+static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 struct btrfs_path *path,
+                                 struct walk_control *wc)
+{
+        int ret = 0;
+        int level = wc->level;
+        struct extent_buffer *eb = path->nodes[level];
+        u64 parent = 0;
+        if (wc->stage == UPDATE_BACKREF) {
+                BUG_ON(wc->shared_level < level);
+                if (level < wc->shared_level)
+                        goto out;
+                BUG_ON(wc->refs[level] <= 1);
+                ret = find_next_key(path, level + 1, &wc->update_progress);
+                if (ret > 0)
+                        wc->update_ref = 0;
+                wc->stage = DROP_REFERENCE;
+                wc->shared_level = -1;
+                path->slots[level] = 0;
+                /*
+                 * check reference count again if the block isn't locked.
+                 * we should start walking down the tree again if reference
+                 * count is one.
+                 */
+                if (!path->locks[level]) {
+                        BUG_ON(level == 0);
+                        btrfs_tree_lock(eb);
+                        btrfs_set_lock_blocking(eb);
+                        path->locks[level] = 1;
+                        ret = btrfs_lookup_extent_info(trans, root,
+                                                       eb->start, eb->len,
+                                                       &wc->refs[level],
+                                                       &wc->flags[level]);
                        BUG_ON(ret);
-                        path->slots[*level]++;
+                        BUG_ON(wc->refs[level] == 0);
-                        btrfs_tree_unlock(next);
+                        if (wc->refs[level] == 1) {
-                        free_extent_buffer(next);
+                                btrfs_tree_unlock(eb);
-                        continue;
+                                path->locks[level] = 0;
+                                return 1;
+                        }
+                } else {
+                        BUG_ON(level != 0);
                }
+        }
-                BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+        /* wc->stage == DROP_REFERENCE */
+        BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
-                *level = btrfs_header_level(next);
+        if (wc->refs[level] == 1) {
-                path->nodes[*level] = next;
+                if (level == 0) {
-                path->slots[*level] = 0;
+                        if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
-                path->locks[*level] = 1;
+                                ret = btrfs_dec_ref(trans, root, eb, 1);
-                cond_resched();
+                        else
+                                ret = btrfs_dec_ref(trans, root, eb, 0);
+                        BUG_ON(ret);
+                }
+                /* make block locked assertion in clean_tree_block happy */
+                if (!path->locks[level] &&
+                    btrfs_header_generation(eb) == trans->transid) {
+                        btrfs_tree_lock(eb);
+                        btrfs_set_lock_blocking(eb);
+                        path->locks[level] = 1;
+                }
+                clean_tree_block(trans, root, eb);
        }
-out:
-        if (path->nodes[*level] == root->node)
-                parent = path->nodes[*level];
-        else
-                parent = path->nodes[*level + 1];
-        bytenr = path->nodes[*level]->start;
-        blocksize = path->nodes[*level]->len;
-        ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start,
+        if (eb == root->node) {
-                                btrfs_header_owner(parent), *level, 0);
+                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+                        parent = eb->start;
+                else
+                        BUG_ON(root->root_key.objectid !=
+                               btrfs_header_owner(eb));
+        } else {
+                if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+                        parent = path->nodes[level + 1]->start;
+                else
+                        BUG_ON(root->root_key.objectid !=
+                               btrfs_header_owner(path->nodes[level + 1]));
+        }
+        ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
+                                root->root_key.objectid, level, 0);
        BUG_ON(ret);
+out:
+        wc->refs[level] = 0;
+        wc->flags[level] = 0;
+        return ret;
+}
+static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root,
+                                   struct btrfs_path *path,
+                                   struct walk_control *wc)
+{
+        struct extent_buffer *next;
+        struct extent_buffer *cur;
+        u64 bytenr;
+        u64 ptr_gen;
+        u32 blocksize;
+        int level = wc->level;
+        int ret;
+        while (level >= 0) {
+                cur = path->nodes[level];
+                BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
+                ret = walk_down_proc(trans, root, path, wc);
+                if (ret > 0)
+                        break;
+                if (level == 0)
+                        break;
+                bytenr = btrfs_node_blockptr(cur, path->slots[level]);
+                blocksize = btrfs_level_size(root, level - 1);
+                ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
+                next = read_tree_block(root, bytenr, blocksize, ptr_gen);
+                btrfs_tree_lock(next);
+                btrfs_set_lock_blocking(next);
-        if (path->locks[*level]) {
+                level--;
-                btrfs_tree_unlock(path->nodes[*level]);
+                BUG_ON(level != btrfs_header_level(next));
-                path->locks[*level] = 0;
+                path->nodes[level] = next;
+                path->slots[level] = 0;
+                path->locks[level] = 1;
+                wc->level = level;
        }
-        free_extent_buffer(path->nodes[*level]);
-        path->nodes[*level] = NULL;
-        *level += 1;
-        cond_resched();
        return 0;
 }
-/*
- * helper for dropping snapshots.  This walks back up the tree in the path
- * to find the first node higher up where we haven't yet gone through
- * all the slots
- */
 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
-                                 int *level, int max_level)
+                                 struct walk_control *wc, int max_level)
 {
-        struct btrfs_root_item *root_item = &root->root_item;
+        int level = wc->level;
-        int i;
-        int slot;
        int ret;
-        for (i = *level; i < max_level && path->nodes[i]; i++) {
+        path->slots[level] = btrfs_header_nritems(path->nodes[level]);
-                slot = path->slots[i];
+        while (level < max_level && path->nodes[level]) {
-                if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
+                wc->level = level;
-                        /*
+                if (path->slots[level] + 1 <
-                         * there is more work to do in this level.
+                    btrfs_header_nritems(path->nodes[level])) {
-                         * Update the drop_progress marker to reflect
+                        path->slots[level]++;
-                         * the work we've done so far, and then bump
-                         * the slot number
-                         */
-                        path->slots[i]++;
-                        WARN_ON(*level == 0);
-                        if (max_level == BTRFS_MAX_LEVEL) {
-                                btrfs_node_key(path->nodes[i],
-                                               &root_item->drop_progress,
-                                               path->slots[i]);
-                                root_item->drop_level = i;
-                        }
-                        *level = i;
                        return 0;
                } else {
-                        struct extent_buffer *parent;
+                        ret = walk_up_proc(trans, root, path, wc);
+                        if (ret > 0)
-                        /*
+                                return 0;
-                         * this whole node is done, free our reference
-                         * on it and go up one level
-                         */
-                        if (path->nodes[*level] == root->node)
-                                parent = path->nodes[*level];
-                        else
-                                parent = path->nodes[*level + 1];
-                        clean_tree_block(trans, root, path->nodes[i]);
+                        if (path->locks[level]) {
-                        ret = btrfs_free_extent(trans, root,
+                                btrfs_tree_unlock(path->nodes[level]);
-                                                path->nodes[i]->start,
+                                path->locks[level] = 0;
-                                                path->nodes[i]->len,
-                                                parent->start,
-                                                btrfs_header_owner(parent),
-                                                *level, 0);
-                        BUG_ON(ret);
-                        if (path->locks[*level]) {
-                                btrfs_tree_unlock(path->nodes[i]);
-                                path->locks[i] = 0;
                        }
-                        free_extent_buffer(path->nodes[i]);
+                        free_extent_buffer(path->nodes[level]);
-                        path->nodes[i] = NULL;
+                        path->nodes[level] = NULL;
-                        *level = i + 1;
+                        level++;
                }
        }
        return 1;
 }
 /*
- * drop the reference count on the tree rooted at 'snap'.  This traverses
+ * drop a subvolume tree.
- * the tree freeing any blocks that have a ref count of zero after being
+ *
- * decremented.
+ * this function traverses the tree freeing any blocks that are only
+ * referenced by the tree.
+ *
+ * when a shared tree block is found, this function decreases its
+ * reference count by one. if update_ref is true, this function
+ * also makes sure backrefs for the shared block and all lower level
+ * blocks are properly updated.
 */
-int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
-                        *root)
 {
-        int ret = 0;
-        int wret;
-        int level;
        struct btrfs_path *path;
-        int update_count;
+        struct btrfs_trans_handle *trans;
+        struct btrfs_root *tree_root = root->fs_info->tree_root;
        struct btrfs_root_item *root_item = &root->root_item;
+        struct walk_control *wc;
+        struct btrfs_key key;
+        int err = 0;
+        int ret;
+        int level;
        path = btrfs_alloc_path();
        BUG_ON(!path);
-        level = btrfs_header_level(root->node);
+        wc = kzalloc(sizeof(*wc), GFP_NOFS);
+        BUG_ON(!wc);
+        trans = btrfs_start_transaction(tree_root, 1);
        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+                level = btrfs_header_level(root->node);
                path->nodes[level] = btrfs_lock_root_node(root);
                btrfs_set_lock_blocking(path->nodes[level]);
                path->slots[level] = 0;
                path->locks[level] = 1;
+                memset(&wc->update_progress, 0,
+                       sizeof(wc->update_progress));
        } else {
-                struct btrfs_key key;
-                struct btrfs_disk_key found_key;
-                struct extent_buffer *node;
                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+                memcpy(&wc->update_progress, &key,
+                       sizeof(wc->update_progress));
                level = root_item->drop_level;
+                BUG_ON(level == 0);
                path->lowest_level = level;
-                wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-                if (wret < 0) {
+                path->lowest_level = 0;
-                        ret = wret;
+                if (ret < 0) {
+                        err = ret;
                        goto out;
                }
-                node = path->nodes[level];
+                btrfs_node_key_to_cpu(path->nodes[level], &key,
-                btrfs_node_key(node, &found_key, path->slots[level]);
+                                      path->slots[level]);
-                WARN_ON(memcmp(&found_key, &root_item->drop_progress,
+                WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
-                               sizeof(found_key)));
                /*
                 * unlock our path, this is safe because only this
                 * function is allowed to delete this snapshot
                 */
                btrfs_unlock_up_safe(path, 0);
+                level = btrfs_header_level(root->node);
+                while (1) {
+                        btrfs_tree_lock(path->nodes[level]);
+                        btrfs_set_lock_blocking(path->nodes[level]);
+                        ret = btrfs_lookup_extent_info(trans, root,
+                                                path->nodes[level]->start,
+                                                path->nodes[level]->len,
+                                                &wc->refs[level],
+                                                &wc->flags[level]);
+                        BUG_ON(ret);
+                        BUG_ON(wc->refs[level] == 0);
+                        if (level == root_item->drop_level)
+                                break;
+                        btrfs_tree_unlock(path->nodes[level]);
+                        WARN_ON(wc->refs[level] != 1);
+                        level--;
+                }
        }
+        wc->level = level;
+        wc->shared_level = -1;
+        wc->stage = DROP_REFERENCE;
+        wc->update_ref = update_ref;
+        wc->keep_locks = 0;
        while (1) {
-                unsigned long update;
+                ret = walk_down_tree(trans, root, path, wc);
-                wret = walk_down_tree(trans, root, path, &level);
+                if (ret < 0) {
-                if (wret > 0)
+                        err = ret;
                        break;
-                if (wret < 0)
+                }
-                        ret = wret;
-                wret = walk_up_tree(trans, root, path, &level,
+                ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
-                                    BTRFS_MAX_LEVEL);
+                if (ret < 0) {
-                if (wret > 0)
+                        err = ret;
                        break;
-                if (wret < 0)
+                }
-                        ret = wret;
-                if (trans->transaction->in_commit ||
+                if (ret > 0) {
-                    trans->transaction->delayed_refs.flushing) {
+                        BUG_ON(wc->stage != DROP_REFERENCE);
-                        ret = -EAGAIN;
                        break;
                }
-                for (update_count = 0; update_count < 16; update_count++) {
+                if (wc->stage == DROP_REFERENCE) {
+                        level = wc->level;
+                        btrfs_node_key(path->nodes[level],
+                                       &root_item->drop_progress,
+                                       path->slots[level]);
+                        root_item->drop_level = level;
+                }
+                BUG_ON(wc->level == 0);
+                if (trans->transaction->in_commit ||
+                    trans->transaction->delayed_refs.flushing) {
+                        ret = btrfs_update_root(trans, tree_root,
+                                                &root->root_key,
+                                                root_item);
+                        BUG_ON(ret);
+                        btrfs_end_transaction(trans, tree_root);
+                        trans = btrfs_start_transaction(tree_root, 1);
+                } else {
+                        unsigned long update;
                        update = trans->delayed_ref_updates;
                        trans->delayed_ref_updates = 0;
                        if (update)
-                                btrfs_run_delayed_refs(trans, root, update);
+                                btrfs_run_delayed_refs(trans, tree_root,
-                        else
+                                                       update);
-                                break;
                }
        }
+        btrfs_release_path(root, path);
+        BUG_ON(err);
+        ret = btrfs_del_root(trans, tree_root, &root->root_key);
+        BUG_ON(ret);
+        free_extent_buffer(root->node);
+        free_extent_buffer(root->commit_root);
+        kfree(root);
 out:
+        btrfs_end_transaction(trans, tree_root);
+        kfree(wc);
        btrfs_free_path(path);
-        return ret;
+        return err;
 }
+/*
+ * drop subtree rooted at tree block 'node'.
+ *
+ * NOTE: this function will unlock and release tree block 'node'
+ */
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct extent_buffer *node,
                        struct extent_buffer *parent)
 {
        struct btrfs_path *path;
+        struct walk_control *wc;
        int level;
        int parent_level;
        int ret = 0;
        int wret;
+        BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
        path = btrfs_alloc_path();
        BUG_ON(!path);
+        wc = kzalloc(sizeof(*wc), GFP_NOFS);
+        BUG_ON(!wc);
        btrfs_assert_tree_locked(parent);
        parent_level = btrfs_header_level(parent);
        extent_buffer_get(parent);
@@ -4817,24 +5243,33 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
        btrfs_assert_tree_locked(node);
        level = btrfs_header_level(node);
-        extent_buffer_get(node);
        path->nodes[level] = node;
        path->slots[level] = 0;
+        path->locks[level] = 1;
+        wc->refs[parent_level] = 1;
+        wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+        wc->level = level;
+        wc->shared_level = -1;
+        wc->stage = DROP_REFERENCE;
+        wc->update_ref = 0;
+        wc->keep_locks = 1;
        while (1) {
-                wret = walk_down_tree(trans, root, path, &level);
+                wret = walk_down_tree(trans, root, path, wc);
-                if (wret < 0)
+                if (wret < 0) {
                        ret = wret;
-                if (wret != 0)
                        break;
+                }
-                wret = walk_up_tree(trans, root, path, &level, parent_level);
+                wret = walk_up_tree(trans, root, path, wc, parent_level);
                if (wret < 0)
                        ret = wret;
                if (wret != 0)
                        break;
        }
+        kfree(wc);
        btrfs_free_path(path);
        return ret;
 }
@@ -6739,11 +7174,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
                         &info->block_group_cache_tree);
                spin_unlock(&info->block_group_cache_lock);
-                btrfs_remove_free_space_cache(block_group);
                down_write(&block_group->space_info->groups_sem);
                list_del(&block_group->list);
                up_write(&block_group->space_info->groups_sem);
+                if (block_group->cached == BTRFS_CACHE_STARTED)
+                        wait_event(block_group->caching_q,
+                                   block_group_cache_done(block_group));
+                btrfs_remove_free_space_cache(block_group);
                WARN_ON(atomic_read(&block_group->count) != 1);
                kfree(block_group);
@@ -6809,9 +7249,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                atomic_set(&cache->count, 1);
                spin_lock_init(&cache->lock);
                spin_lock_init(&cache->tree_lock);
-                mutex_init(&cache->cache_mutex);
+                cache->fs_info = info;
+                init_waitqueue_head(&cache->caching_q);
                INIT_LIST_HEAD(&cache->list);
                INIT_LIST_HEAD(&cache->cluster_list);
+                /*
+                 * we only want to have 32k of ram per block group for keeping
+                 * track of free space, and if we pass 1/2 of that we want to
+                 * start converting things over to using bitmaps
+                 */
+                cache->extents_thresh = ((1024 * 32) / 2) /
+                        sizeof(struct btrfs_free_space);
                read_extent_buffer(leaf, &cache->item,
                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
                                   sizeof(cache->item));
@@ -6820,6 +7270,26 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                key.objectid = found_key.objectid + found_key.offset;
                btrfs_release_path(root, path);
                cache->flags = btrfs_block_group_flags(&cache->item);
+                cache->sectorsize = root->sectorsize;
+                remove_sb_from_cache(root, cache);
+                /*
+                 * check for two cases, either we are full, and therefore
+                 * don't need to bother with the caching work since we won't
+                 * find any space, or we are empty, and we can just add all
+                 * the space in and be done with it.  This saves us _a lot_ of
+                 * time, particularly in the full case.
+                 */
+                if (found_key.offset == btrfs_block_group_used(&cache->item)) {
+                        cache->cached = BTRFS_CACHE_FINISHED;
+                } else if (btrfs_block_group_used(&cache->item) == 0) {
+                        cache->cached = BTRFS_CACHE_FINISHED;
+                        add_new_free_space(cache, root->fs_info,
+                                           found_key.objectid,
+                                           found_key.objectid +
+                                           found_key.offset);
+                }
                ret = update_space_info(info, cache->flags, found_key.offset,
                                        btrfs_block_group_used(&cache->item),
@@ -6863,10 +7333,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        cache->key.objectid = chunk_offset;
        cache->key.offset = size;
        cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+        cache->sectorsize = root->sectorsize;
+        /*
+         * we only want to have 32k of ram per block group for keeping track
+         * of free space, and if we pass 1/2 of that we want to start
+         * converting things over to using bitmaps
+         */
+        cache->extents_thresh = ((1024 * 32) / 2) /
+                sizeof(struct btrfs_free_space);
        atomic_set(&cache->count, 1);
        spin_lock_init(&cache->lock);
        spin_lock_init(&cache->tree_lock);
-        mutex_init(&cache->cache_mutex);
+        init_waitqueue_head(&cache->caching_q);
        INIT_LIST_HEAD(&cache->list);
        INIT_LIST_HEAD(&cache->cluster_list);
@@ -6875,6 +7354,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        cache->flags = type;
        btrfs_set_block_group_flags(&cache->item, type);
+        cache->cached = BTRFS_CACHE_FINISHED;
+        remove_sb_from_cache(root, cache);
+        add_new_free_space(cache, root->fs_info, chunk_offset,
+                           chunk_offset + size);
        ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
                                &cache->space_info);
        BUG_ON(ret);
@@ -6933,7 +7418,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        rb_erase(&block_group->cache_node,
                 &root->fs_info->block_group_cache_tree);
        spin_unlock(&root->fs_info->block_group_cache_lock);
-        btrfs_remove_free_space_cache(block_group);
        down_write(&block_group->space_info->groups_sem);
        /*
         * we must use list_del_init so people can check to see if they
@@ -6942,11 +7427,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        list_del_init(&block_group->list);
        up_write(&block_group->space_info->groups_sem);
+        if (block_group->cached == BTRFS_CACHE_STARTED)
+                wait_event(block_group->caching_q,
+                           block_group_cache_done(block_group));
+        btrfs_remove_free_space_cache(block_group);
        spin_lock(&block_group->space_info->lock);
        block_group->space_info->total_bytes -= block_group->key.offset;
        block_group->space_info->bytes_readonly -= block_group->key.offset;
        spin_unlock(&block_group->space_info->lock);
-        block_group->space_info->full = 0;
+        btrfs_clear_space_info_full(root->fs_info);
        btrfs_put_block_group(block_group);
        btrfs_put_block_group(block_group);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 126477eaecf5..4b833972273a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -22,7 +22,6 @@
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/backing-dev.h>
 #include <linux/mpage.h>
 #include <linux/swap.h>
@@ -151,7 +150,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
        }
        if (end_pos > isize) {
                i_size_write(inode, end_pos);
-                btrfs_update_inode(trans, root, inode);
+                /* we've only changed i_size in ram, and we haven't updated
+                 * the disk i_size.  There is no need to log the inode
+                 * at this time.
+                 */
        }
        err = btrfs_end_transaction(trans, root);
 out_unlock:
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4538e48581a5..5edcee3a617f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -16,45 +16,46 @@
 * Boston, MA 021110-1307, USA.
 */
+#include <linux/pagemap.h>
 #include <linux/sched.h>
+#include <linux/math64.h>
 #include "ctree.h"
 #include "free-space-cache.h"
 #include "transaction.h"
-struct btrfs_free_space {
+#define BITS_PER_BITMAP         (PAGE_CACHE_SIZE * 8)
-        struct rb_node bytes_index;
+#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
-        struct rb_node offset_index;
-        u64 offset;
-        u64 bytes;
-};
-static int tree_insert_offset(struct rb_root *root, u64 offset,
+static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
-                              struct rb_node *node)
+                                          u64 offset)
 {
-        struct rb_node **p = &root->rb_node;
+        BUG_ON(offset < bitmap_start);
-        struct rb_node *parent = NULL;
+        offset -= bitmap_start;
-        struct btrfs_free_space *info;
+        return (unsigned long)(div64_u64(offset, sectorsize));
+}
-        while (*p) {
+static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
-                parent = *p;
+{
-                info = rb_entry(parent, struct btrfs_free_space, offset_index);
+        return (unsigned long)(div64_u64(bytes, sectorsize));
+}
-                if (offset < info->offset)
+static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
-                        p = &(*p)->rb_left;
+                                   u64 offset)
-                else if (offset > info->offset)
+{
-                        p = &(*p)->rb_right;
+        u64 bitmap_start;
-                else
+        u64 bytes_per_bitmap;
-                        return -EEXIST;
-        }
-        rb_link_node(node, parent, p);
+        bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
-        rb_insert_color(node, root);
+        bitmap_start = offset - block_group->key.objectid;
+        bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
+        bitmap_start *= bytes_per_bitmap;
+        bitmap_start += block_group->key.objectid;
-        return 0;
+        return bitmap_start;
 }
-static int tree_insert_bytes(struct rb_root *root, u64 bytes,
+static int tree_insert_offset(struct rb_root *root, u64 offset,
-                             struct rb_node *node)
+                              struct rb_node *node, int bitmap)
 {
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
@@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
        while (*p) {
                parent = *p;
-                info = rb_entry(parent, struct btrfs_free_space, bytes_index);
+                info = rb_entry(parent, struct btrfs_free_space, offset_index);
-                if (bytes < info->bytes)
+                if (offset < info->offset) {
                        p = &(*p)->rb_left;
-                else
+                } else if (offset > info->offset) {
                        p = &(*p)->rb_right;
+                } else {
+                        /*
+                         * we could have a bitmap entry and an extent entry
+                         * share the same offset.  If this is the case, we want
+                         * the extent entry to always be found first if we do a
+                         * linear search through the tree, since we want to have
+                         * the quickest allocation time, and allocating from an
+                         * extent is faster than allocating from a bitmap.  So
+                         * if we're inserting a bitmap and we find an entry at
+                         * this offset, we want to go right, or after this entry
+                         * logically.  If we are inserting an extent and we've
+                         * found a bitmap, we want to go left, or before
+                         * logically.
+                         */
+                        if (bitmap) {
+                                WARN_ON(info->bitmap);
+                                p = &(*p)->rb_right;
+                        } else {
+                                WARN_ON(!info->bitmap);
+                                p = &(*p)->rb_left;
+                        }
+                }
        }
        rb_link_node(node, parent, p);
@@ -79,110 +102,143 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
 /*
 * searches the tree for the given offset.
 *
- * fuzzy == 1: this is used for allocations where we are given a hint of where
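+ * fuzzy - If this is set, then we are trying to make an allocation, and we just
- * to look for free space.  Because the hint may not be completely on an offset
+ * want the entry that covers the given offset or, failing that, the first
- * mark, or the hint may no longer point to free space we need to fudge our
+ * entry that ends after the given offset.  No size check is done here.
+ *
+ * bitmap_only - If this is set, only a bitmap entry that starts exactly at the
+ * given offset is returned.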
- * results a bit.  So we look for free space starting at or after offset with at
- * least bytes size.  We prefer to find as close to the given offset as we can.
- * Also if the offset is within a free space range, then we will return the free
- * space that contains the given offset, which means we can return a free space
- * chunk with an offset before the provided offset.
- *
- * fuzzy == 0: this is just a normal tree search.  Give us the free space that
- * starts at the given offset which is at least bytes size, and if its not there
- * return NULL.
 */
-static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
+static struct btrfs_free_space *
-                                                   u64 offset, u64 bytes,
+tree_search_offset(struct btrfs_block_group_cache *block_group,
-                                                   int fuzzy)
+                   u64 offset, int bitmap_only, int fuzzy)
 {
-        struct rb_node *n = root->rb_node;
+        struct rb_node *n = block_group->free_space_offset.rb_node;
-        struct btrfs_free_space *entry, *ret = NULL;
+        struct btrfs_free_space *entry, *prev = NULL;
+        /* find entry that is closest to the 'offset' */
+        while (1) {
+                if (!n) {
+                        entry = NULL;
+                        break;
+                }
-        while (n) {
                entry = rb_entry(n, struct btrfs_free_space, offset_index);
+                prev = entry;
-                if (offset < entry->offset) {
+                if (offset < entry->offset)
-                        if (fuzzy &&
-                            (!ret || entry->offset < ret->offset) &&
-                            (bytes <= entry->bytes))
-                                ret = entry;
                        n = n->rb_left;
-                } else if (offset > entry->offset) {
+                else if (offset > entry->offset)
-                        if (fuzzy &&
-                            (entry->offset + entry->bytes - 1) >= offset &&
-                            bytes <= entry->bytes) {
-                                ret = entry;
-                                break;
-                        }
                        n = n->rb_right;
-                } else {
+                else
-                        if (bytes > entry->bytes) {
-                                n = n->rb_right;
-                                continue;
-                        }
-                        ret = entry;
                        break;
-                }
        }
-        return ret;
+        if (bitmap_only) {
-}
+                if (!entry)
+                        return NULL;
+                if (entry->bitmap)
+                        return entry;
-/*
+                /*
- * return a chunk at least bytes size, as close to offset that we can get.
+                 * bitmap entry and extent entry may share same offset,
- */
+                 * in that case, bitmap entry comes after extent entry.
-static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
+                 */
-                                                  u64 offset, u64 bytes)
+                n = rb_next(n);
-{
+                if (!n)
-        struct rb_node *n = root->rb_node;
+                        return NULL;
-        struct btrfs_free_space *entry, *ret = NULL;
+                entry = rb_entry(n, struct btrfs_free_space, offset_index);
+                if (entry->offset != offset)
-        while (n) {
+                        return NULL;
-                entry = rb_entry(n, struct btrfs_free_space, bytes_index);
-                if (bytes < entry->bytes) {
+                WARN_ON(!entry->bitmap);
+                return entry;
+        } else if (entry) {
+                if (entry->bitmap) {
                        /*
-                         * We prefer to get a hole size as close to the size we
+                         * if previous extent entry covers the offset,
-                         * are asking for so we don't take small slivers out of
+                         * we should return it instead of the bitmap entry
-                         * huge holes, but we also want to get as close to the
-                         * offset as possible so we don't have a whole lot of
-                         * fragmentation.
                         */
-                        if (offset <= entry->offset) {
+                        n = &entry->offset_index;
-                                if (!ret)
+                        while (1) {
-                                        ret = entry;
+                                n = rb_prev(n);
-                                else if (entry->bytes < ret->bytes)
+                                if (!n)
-                                        ret = entry;
+                                        break;
-                                else if (entry->offset < ret->offset)
+                                prev = rb_entry(n, struct btrfs_free_space,
-                                        ret = entry;
+                                                offset_index);
+                                if (!prev->bitmap) {
+                                        if (prev->offset + prev->bytes > offset)
+                                                entry = prev;
+                                        break;
+                                }
                        }
-                        n = n->rb_left;
+                }
-                } else if (bytes > entry->bytes) {
+                return entry;
-                        n = n->rb_right;
+        }
+        if (!prev)
+                return NULL;
+        /* find last entry before the 'offset' */
+        entry = prev;
+        if (entry->offset > offset) {
+                n = rb_prev(&entry->offset_index);
+                if (n) {
+                        entry = rb_entry(n, struct btrfs_free_space,
+                                        offset_index);
+                        BUG_ON(entry->offset > offset);
                } else {
-                        /*
+                        if (fuzzy)
-                         * Ok we may have multiple chunks of the wanted size,
+                                return entry;
-                         * so we don't want to take the first one we find, we
+                        else
-                         * want to take the one closest to our given offset, so
+                                return NULL;
-                         * keep searching just in case theres a better match.
-                         */
-                        n = n->rb_right;
-                        if (offset > entry->offset)
-                                continue;
-                        else if (!ret || entry->offset < ret->offset)
-                                ret = entry;
                }
        }
-        return ret;
+        if (entry->bitmap) {
+                n = &entry->offset_index;
+                while (1) {
+                        n = rb_prev(n);
+                        if (!n)
+                                break;
+                        prev = rb_entry(n, struct btrfs_free_space,
+                                        offset_index);
+                        if (!prev->bitmap) {
+                                if (prev->offset + prev->bytes > offset)
+                                        return prev;
+                                break;
+                        }
+                }
+                if (entry->offset + BITS_PER_BITMAP *
+                    block_group->sectorsize > offset)
+                        return entry;
+        } else if (entry->offset + entry->bytes > offset)
+                return entry;
+        if (!fuzzy)
+                return NULL;
+        while (1) {
+                if (entry->bitmap) {
+                        if (entry->offset + BITS_PER_BITMAP *
+                            block_group->sectorsize > offset)
+                                break;
+                } else {
+                        if (entry->offset + entry->bytes > offset)
+                                break;
+                }
+                n = rb_next(&entry->offset_index);
+                if (!n)
+                        return NULL;
+                entry = rb_entry(n, struct btrfs_free_space, offset_index);
+        }
+        return entry;
 }
 static void unlink_free_space(struct btrfs_block_group_cache *block_group,
                              struct btrfs_free_space *info)
 {
        rb_erase(&info->offset_index, &block_group->free_space_offset);
-        rb_erase(&info->bytes_index, &block_group->free_space_bytes);
+        block_group->free_extents--;
+        block_group->free_space -= info->bytes;
 }
 static int link_free_space(struct btrfs_block_group_cache *block_group,
@@ -190,17 +246,353 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
 {
        int ret = 0;
+        BUG_ON(!info->bitmap && !info->bytes);
-        BUG_ON(!info->bytes);
        ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
-                                 &info->offset_index);
+                                 &info->offset_index, (info->bitmap != NULL));
        if (ret)
                return ret;
-        ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
+        block_group->free_space += info->bytes;
-                                &info->bytes_index);
+        block_group->free_extents++;
-        if (ret)
+        return ret;
-                return ret;
+}
+/*
+ * Re-derive how many extent entries this block group may keep before new free
+ * space has to go into bitmaps.
+ *
+ * The memory budget is MAX_CACHE_BYTES_PER_GIG for every full gigabyte of
+ * block group size (key.offset), and never less than one such unit so that
+ * block groups smaller than 1GB do not end up with a budget of zero.  Each
+ * bitmap costs PAGE_CACHE_SIZE bytes of that budget.  If the bitmaps plus the
+ * current extent allowance exceed it, extents_thresh is cut back to whatever
+ * is left after the bitmaps, or to 0 if the bitmaps alone use it up.  The
+ * threshold is never raised here.
+ */
+static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
+{
+        u64 size = block_group->key.offset;
+        u64 max_bytes, bitmap_bytes, possible_bytes;
+        if (size < 1024 * 1024 * 1024)
+                max_bytes = MAX_CACHE_BYTES_PER_GIG;
+        else
+                max_bytes = MAX_CACHE_BYTES_PER_GIG *
+                        div64_u64(size, 1024 * 1024 * 1024);
+        bitmap_bytes = (u64)block_group->total_bitmaps * PAGE_CACHE_SIZE;
+        possible_bytes = bitmap_bytes +
+                (sizeof(struct btrfs_free_space) *
+                 block_group->extents_thresh);
+        if (possible_bytes <= max_bytes)
+                return;
+        /*
+         * compare before subtracting: the values are unsigned, so the
+         * difference must never be allowed to wrap
+         */
+        if (bitmap_bytes >= max_bytes) {
+                block_group->extents_thresh = 0;
+                return;
+        }
+        block_group->extents_thresh = div64_u64(max_bytes - bitmap_bytes,
+                                        sizeof(struct btrfs_free_space));
+}
+/*
+ * Clear the bits of the bitmap entry @info that correspond to the byte range
+ * starting at @offset and spanning @bytes, then subtract @bytes from both the
+ * entry's byte count and the block group's free_space counter.  The range
+ * must not run past the end of the bitmap.
+ */
+static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
+                              struct btrfs_free_space *info, u64 offset,
+                              u64 bytes)
+{
+        unsigned long bit, stop;
+        bit = offset_to_bit(info->offset, block_group->sectorsize, offset);
+        stop = bit + bytes_to_bits(bytes, block_group->sectorsize);
+        BUG_ON(stop > BITS_PER_BITMAP);
+        while (bit < stop) {
+                clear_bit(bit, info->bitmap);
+                bit++;
+        }
+        block_group->free_space -= bytes;
+        info->bytes -= bytes;
+}
+/*
+ * Set the bits of the bitmap entry @info that correspond to the byte range
+ * starting at @offset and spanning @bytes, then add @bytes to both the
+ * entry's byte count and the block group's free_space counter.  The range
+ * must not run past the end of the bitmap.
+ */
+static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
+                            struct btrfs_free_space *info, u64 offset,
+                            u64 bytes)
+{
+        unsigned long first, past_last, pos;
+        first = offset_to_bit(info->offset, block_group->sectorsize, offset);
+        past_last = first + bytes_to_bits(bytes, block_group->sectorsize);
+        BUG_ON(past_last > BITS_PER_BITMAP);
+        pos = first;
+        while (pos < past_last) {
+                set_bit(pos, info->bitmap);
+                pos++;
+        }
+        block_group->free_space += bytes;
+        info->bytes += bytes;
+}
+/*
+ * Search the bitmap entry @bitmap_info for a run of set bits that is at
+ * least *bytes long.  The search starts at *offset, or at the start of the
+ * bitmap if *offset lies before it.
+ *
+ * Returns 0 on success, with *offset updated to the byte offset where the
+ * run starts and *bytes updated to the length of the run from that point
+ * (which may be more than was asked for).  Returns -1 and leaves *offset and
+ * *bytes untouched if no run is long enough.
+ */
+static int search_bitmap(struct btrfs_block_group_cache *block_group,
+                         struct btrfs_free_space *bitmap_info, u64 *offset,
+                         u64 *bytes)
+{
+        unsigned long found_bits = 0;
+        unsigned long bits, i;
+        unsigned long next_zero;
+        i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
+                          max_t(u64, *offset, bitmap_info->offset));
+        bits = bytes_to_bits(*bytes, block_group->sectorsize);
+        for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
+             i < BITS_PER_BITMAP;
+             i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
+                next_zero = find_next_zero_bit(bitmap_info->bitmap,
+                                               BITS_PER_BITMAP, i);
+                if ((next_zero - i) >= bits) {
+                        found_bits = next_zero - i;
+                        break;
+                }
+                /* this run is too short, resume after the zero bit */
+                i = next_zero;
+        }
+        if (found_bits) {
+                /*
+                 * widen before multiplying so the product is computed in
+                 * 64 bits, the same way *bytes is computed below
+                 */
+                *offset = (u64)i * block_group->sectorsize +
+                        bitmap_info->offset;
+                *bytes = (u64)(found_bits) * block_group->sectorsize;
+                return 0;
+        }
+        return -1;
+}
+/*
+ * Walk the offset-sorted tree, starting from the entry that a fuzzy lookup
+ * returns for the bitmap-aligned position of *offset, and return the first
+ * entry that can supply *bytes.  For an extent entry *offset and *bytes are
+ * set to the whole extent; for a bitmap entry they are set by
+ * search_bitmap().  Returns NULL if nothing fits.  @debug is not used.
+ */
+static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
+                                                *block_group, u64 *offset,
+                                                u64 *bytes, int debug)
+{
+        struct btrfs_free_space *cur;
+        struct rb_node *n;
+        if (!block_group->free_space_offset.rb_node)
+                return NULL;
+        cur = tree_search_offset(block_group,
+                                 offset_to_bitmap(block_group, *offset),
+                                 0, 1);
+        if (!cur)
+                return NULL;
+        n = &cur->offset_index;
+        while (n) {
+                cur = rb_entry(n, struct btrfs_free_space, offset_index);
+                n = rb_next(n);
+                /* not enough free bytes in this entry at all */
+                if (cur->bytes < *bytes)
+                        continue;
+                if (!cur->bitmap) {
+                        *offset = cur->offset;
+                        *bytes = cur->bytes;
+                        return cur;
+                }
+                if (search_bitmap(block_group, cur, offset, bytes) == 0)
+                        return cur;
+        }
+        return NULL;
+}
+/*
+ * Link @info into the block group's tree as the bitmap entry for the bitmap
+ * window that contains @offset, bump the bitmap count and re-derive the
+ * extent threshold.  Trips a BUG if the block group already has as many
+ * bitmaps as are needed to cover its whole size.
+ */
+static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
+                           struct btrfs_free_space *info, u64 offset)
+{
+        u64 bitmap_span = BITS_PER_BITMAP * block_group->sectorsize;
+        int limit;
+        /* number of bitmaps needed for the block group, rounded up */
+        limit = (int)div64_u64(block_group->key.offset + bitmap_span - 1,
+                               bitmap_span);
+        BUG_ON(block_group->total_bitmaps >= limit);
+        info->offset = offset_to_bitmap(block_group, offset);
+        link_free_space(block_group, info);
+        block_group->total_bitmaps += 1;
+        recalculate_thresholds(block_group);
+}
+static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
+                              struct btrfs_free_space *bitmap_info,
+                              u64 *offset, u64 *bytes)
+{       /*
+         * Clear the range described by *offset / *bytes out of bitmap_info,
+         * moving on to the following bitmap entry when the range runs past
+         * the end of the current one.  *offset and *bytes are advanced as
+         * space is cleared.
+         *
+         * Returns 0 once nothing is left to clear, -EINVAL if bytes remain
+         * but there is no next entry, and -EAGAIN if the next entry is an
+         * extent or does not hold the rest of the range at *offset.
+         *
+         * A bitmap whose byte count drops to zero is unlinked and freed; its
+         * successor is looked up before that happens.
+         *
+         * NOTE(review): a range that starts exactly at bitmap_info->offset
+         * with *offset + *bytes > end matches neither clearing branch below,
+         * so nothing is cleared for it here -- confirm callers cannot pass
+         * such a range.
+         */
+        u64 end;        /* last byte covered by bitmap_info */
+        u64 search_start, search_bytes;
+        int ret;
+again:
+        end = bitmap_info->offset +
+                (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
+        /*
+         * XXX - this can go away after a few releases.
+         *
+         * since the only user of btrfs_remove_free_space is the tree logging
+         * stuff, and the only way to test that is under crash conditions, we
+         * want to have this debug stuff here just in case something's not
+         * working.  Search the bitmap for the space we are trying to use to
+         * make sure it's actually there.  If it's not there then we need to
+         * stop because something has gone wrong.
+         */
+        search_start = *offset;
+        search_bytes = *bytes;
+        ret = search_bitmap(block_group, bitmap_info, &search_start,
+                            &search_bytes);
+        BUG_ON(ret < 0 || search_start != *offset);
+        if (*offset > bitmap_info->offset && *offset + *bytes > end) {
+                bitmap_clear_bits(block_group, bitmap_info, *offset,
+                                  end - *offset + 1);
+                *bytes -= end - *offset + 1;
+                *offset = end + 1;
+        } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
+                bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
+                *bytes = 0;
+        }
+        if (*bytes) {
+                struct rb_node *next = rb_next(&bitmap_info->offset_index);
+                if (!bitmap_info->bytes) {
+                        unlink_free_space(block_group, bitmap_info);
+                        kfree(bitmap_info->bitmap);
+                        kfree(bitmap_info);
+                        block_group->total_bitmaps--;
+                        recalculate_thresholds(block_group);
+                }
+                /*
+                 * no entry after this bitmap, but we still have bytes to
+                 * remove, so something has gone wrong.
+                 */
+                if (!next)
+                        return -EINVAL;
+                bitmap_info = rb_entry(next, struct btrfs_free_space,
+                                       offset_index);
+                /*
+                 * if the next entry isn't a bitmap we need to return to let the
+                 * extent stuff do its work.
+                 */
+                if (!bitmap_info->bitmap)
+                        return -EAGAIN;
+                /*
+                 * Ok the next item is a bitmap, but it may not actually hold
+                 * the information for the rest of this free space stuff, so
+                 * look for it, and if we don't find it return so we can try
+                 * everything over again.
+                 */
+                search_start = *offset;
+                search_bytes = *bytes;
+                ret = search_bitmap(block_group, bitmap_info, &search_start,
+                                    &search_bytes);
+                if (ret < 0 || search_start != *offset)
+                        return -EAGAIN;
+                goto again;
+        } else if (!bitmap_info->bytes) {
+                unlink_free_space(block_group, bitmap_info);
+                kfree(bitmap_info->bitmap);
+                kfree(bitmap_info);
+                block_group->total_bitmaps--;
+                recalculate_thresholds(block_group);
+        }
+        return 0;
+}
+static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
+                              struct btrfs_free_space *info)
+{       /*
+         * Try to record the free range described by info (info->offset,
+         * info->bytes) in bitmap entries instead of as an extent entry,
+         * creating bitmap entries as needed and splitting the range across
+         * consecutive bitmaps when it runs past the end of one.
+         *
+         * Returns 0 if the range should be added as a normal extent entry
+         * (info is left untouched), 1 if the whole range was set in bitmaps,
+         * or -ENOMEM if an allocation failed.  When the result is not 0, info
+         * has either been linked into the tree as a new bitmap entry or been
+         * freed here, so the caller must not use it again.
+         *
+         * The spin_unlock/spin_lock pair around the allocations implies the
+         * caller holds block_group->tree_lock; the tree is searched again
+         * after the lock is retaken.
+         */
+        struct btrfs_free_space *bitmap_info;
+        int added = 0;  /* a new bitmap was just linked by add_new_bitmap() */
+        u64 bytes, offset, end; /* end: first byte past the current bitmap */
+        int ret;
+        /*
+         * If we are below the extents threshold then we can add this as an
+         * extent, and don't have to deal with the bitmap.  Ranges of four
+         * sectors or less do not take this shortcut.
+         */
+        if (block_group->free_extents < block_group->extents_thresh &&
+            info->bytes > block_group->sectorsize * 4)
+                return 0;
+        /*
+         * some block groups are so tiny they can't be enveloped by a bitmap, so
+         * don't even bother to create a bitmap for this
+         */
+        if (BITS_PER_BITMAP * block_group->sectorsize >
+            block_group->key.offset)
+                return 0;
+        bytes = info->bytes;
+        offset = info->offset;
+again:
+        bitmap_info = tree_search_offset(block_group,
+                                         offset_to_bitmap(block_group, offset),
+                                         1, 0);
+        if (!bitmap_info) {
+                BUG_ON(added);
+                goto new_bitmap;
+        }
+        end = bitmap_info->offset +
+                (u64)(BITS_PER_BITMAP * block_group->sectorsize);
+        if (offset >= bitmap_info->offset && offset + bytes > end) {
+                bitmap_set_bits(block_group, bitmap_info, offset,
+                                end - offset);
+                bytes -= end - offset;
+                offset = end;
+                added = 0;
+        } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
+                bitmap_set_bits(block_group, bitmap_info, offset, bytes);
+                bytes = 0;
+        } else {
+                BUG();
+        }
+        if (!bytes) {
+                ret = 1;
+                goto out;
+        } else
+                goto again;
+new_bitmap:
+        if (info && info->bitmap) {
+                add_new_bitmap(block_group, info, offset);
+                added = 1;
+                info = NULL;    /* now owned by the tree */
+                goto again;
+        } else {
+                spin_unlock(&block_group->tree_lock);
+                /* no pre-allocated info, allocate a new one */
+                if (!info) {
+                        info = kzalloc(sizeof(struct btrfs_free_space),
+                                       GFP_NOFS);
+                        if (!info) {
+                                spin_lock(&block_group->tree_lock);
+                                ret = -ENOMEM;
+                                goto out;
+                        }
+                }
+                /* allocate the bitmap */
+                info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+                spin_lock(&block_group->tree_lock);
+                if (!info->bitmap) {
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                goto again;
+        }
+out:
+        if (info) {     /* info was not handed over to add_new_bitmap() */
+                if (info->bitmap)
+                        kfree(info->bitmap);
+                kfree(info);
+        }
        return ret;
 }
@@ -208,8 +600,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
                         u64 offset, u64 bytes)
 {
-        struct btrfs_free_space *right_info;
+        struct btrfs_free_space *right_info = NULL;
-        struct btrfs_free_space *left_info;
+        struct btrfs_free_space *left_info = NULL;
        struct btrfs_free_space *info = NULL;
        int ret = 0;
@@ -227,18 +619,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
         * are adding, if there is remove that struct and add a new one to
         * cover the entire range
         */
-        right_info = tree_search_offset(&block_group->free_space_offset,
+        right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
-                                        offset+bytes, 0, 0);
+        if (right_info && rb_prev(&right_info->offset_index))
-        left_info = tree_search_offset(&block_group->free_space_offset,
+                left_info = rb_entry(rb_prev(&right_info->offset_index),
-                                       offset-1, 0, 1);
+                                     struct btrfs_free_space, offset_index);
+        else
+                left_info = tree_search_offset(block_group, offset - 1, 0, 0);
+        /*
+         * If there was no extent directly to the left or right of this new
+         * extent then we know we're going to have to allocate a new extent, so
+         * before we do that see if we need to drop this into a bitmap
+         */
+        if ((!left_info || left_info->bitmap) &&
+            (!right_info || right_info->bitmap)) {
+                ret = insert_into_bitmap(block_group, info);
+                if (ret < 0) {
+                        goto out;
+                } else if (ret) {
+                        ret = 0;
+                        goto out;
+                }
+        }
-        if (right_info) {
+        if (right_info && !right_info->bitmap) {
                unlink_free_space(block_group, right_info);
                info->bytes += right_info->bytes;
                kfree(right_info);
        }
-        if (left_info && left_info->offset + left_info->bytes == offset) {
+        if (left_info && !left_info->bitmap &&
+            left_info->offset + left_info->bytes == offset) {
                unlink_free_space(block_group, left_info);
                info->offset = left_info->offset;
                info->bytes += left_info->bytes;
@@ -248,11 +660,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
        ret = link_free_space(block_group, info);
        if (ret)
                kfree(info);
+out:
        spin_unlock(&block_group->tree_lock);
        if (ret) {
-                printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
+                printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
                BUG_ON(ret == -EEXIST);
        }
@@ -263,40 +675,74 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
                            u64 offset, u64 bytes)
 {
        struct btrfs_free_space *info;
+        struct btrfs_free_space *next_info = NULL;
        int ret = 0;
        spin_lock(&block_group->tree_lock);
-        info = tree_search_offset(&block_group->free_space_offset, offset, 0,
+again:
-                                  1);
+        info = tree_search_offset(block_group, offset, 0, 0);
-        if (info && info->offset == offset) {
+        if (!info) {
-                if (info->bytes < bytes) {
+                /*
-                        printk(KERN_ERR "Found free space at %llu, size %llu,"
+                 * oops didn't find an extent that matched the space we wanted
-                               "trying to use %llu\n",
+                 * to remove, look for a bitmap instead
-                               (unsigned long long)info->offset,
+                 */
-                               (unsigned long long)info->bytes,
+                info = tree_search_offset(block_group,
-                               (unsigned long long)bytes);
+                                          offset_to_bitmap(block_group, offset),
+                                          1, 0);
+                if (!info) {
+                        WARN_ON(1);
+                        goto out_lock;
+                }
+        }
+        if (info->bytes < bytes && rb_next(&info->offset_index)) {
+                u64 end;
+                next_info = rb_entry(rb_next(&info->offset_index),
+                                             struct btrfs_free_space,
+                                             offset_index);
+                if (next_info->bitmap)
+                        end = next_info->offset + BITS_PER_BITMAP *
+                                block_group->sectorsize - 1;
+                else
+                        end = next_info->offset + next_info->bytes;
+                if (next_info->bytes < bytes ||
+                    next_info->offset > offset || offset > end) {
+                        printk(KERN_CRIT "Found free space at %llu, size %llu,"
+                              " trying to use %llu\n",
+                              (unsigned long long)info->offset,
+                              (unsigned long long)info->bytes,
+                              (unsigned long long)bytes);
                        WARN_ON(1);
                        ret = -EINVAL;
-                        spin_unlock(&block_group->tree_lock);
+                        goto out_lock;
-                        goto out;
                }
-                unlink_free_space(block_group, info);
-                if (info->bytes == bytes) {
+                info = next_info;
-                        kfree(info);
+        }
-                        spin_unlock(&block_group->tree_lock);
-                        goto out;
+        if (info->bytes == bytes) {
+                unlink_free_space(block_group, info);
+                if (info->bitmap) {
+                        kfree(info->bitmap);
+                        block_group->total_bitmaps--;
                }
+                kfree(info);
+                goto out_lock;
+        }
+        if (!info->bitmap && info->offset == offset) {
+                unlink_free_space(block_group, info);
                info->offset += bytes;
                info->bytes -= bytes;
+                link_free_space(block_group, info);
+                goto out_lock;
+        }
-                ret = link_free_space(block_group, info);
+        if (!info->bitmap && info->offset <= offset &&
-                spin_unlock(&block_group->tree_lock);
+            info->offset + info->bytes >= offset + bytes) {
-                BUG_ON(ret);
-        } else if (info && info->offset < offset &&
-                   info->offset + info->bytes >= offset + bytes) {
                u64 old_start = info->offset;
                /*
                 * we're freeing space in the middle of the info,
@@ -312,7 +758,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
                        info->offset = offset + bytes;
                        info->bytes = old_end - info->offset;
                        ret = link_free_space(block_group, info);
-                        BUG_ON(ret);
+                        WARN_ON(ret);
+                        if (ret)
+                                goto out_lock;
                } else {
                        /* the hole we're creating ends at the end
                         * of the info struct, just free the info
@@ -320,32 +768,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
                        kfree(info);
                }
                spin_unlock(&block_group->tree_lock);
-                /* step two, insert a new info struct to cover anything
-                 * before the hole
+                /* step two, insert a new info struct to cover
+                 * anything before the hole
                 */
                ret = btrfs_add_free_space(block_group, old_start,
                                           offset - old_start);
-                BUG_ON(ret);
+                WARN_ON(ret);
-        } else {
+                goto out;
-                spin_unlock(&block_group->tree_lock);
-                if (!info) {
-                        printk(KERN_ERR "couldn't find space %llu to free\n",
-                               (unsigned long long)offset);
-                        printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
-                               block_group->cached,
-                               (unsigned long long)block_group->key.objectid,
-                               (unsigned long long)block_group->key.offset);
-                        btrfs_dump_free_space(block_group, bytes);
-                } else if (info) {
-                        printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
-                               "but wanted offset=%llu bytes=%llu\n",
-                               (unsigned long long)info->offset,
-                               (unsigned long long)info->bytes,
-                               (unsigned long long)offset,
-                               (unsigned long long)bytes);
-                }
-                WARN_ON(1);
        }
+        ret = remove_from_bitmap(block_group, info, &offset, &bytes);
+        if (ret == -EAGAIN)
+                goto again;
+        BUG_ON(ret);
+out_lock:
+        spin_unlock(&block_group->tree_lock);
 out:
        return ret;
 }
@@ -361,10 +799,13 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
                info = rb_entry(n, struct btrfs_free_space, offset_index);
                if (info->bytes >= bytes)
                        count++;
-                printk(KERN_ERR "entry offset %llu, bytes %llu\n",
+                printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
                       (unsigned long long)info->offset,
-                       (unsigned long long)info->bytes);
+                       (unsigned long long)info->bytes,
+                       (info->bitmap) ? "yes" : "no");
        }
+        printk(KERN_INFO "block group has cluster?: %s\n",
+               list_empty(&block_group->cluster_list) ? "no" : "yes");
        printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
               "\n", count);
 }
@@ -397,26 +838,35 @@ __btrfs_return_cluster_to_free_space(
 {
        struct btrfs_free_space *entry;
        struct rb_node *node;
+        bool bitmap;
        spin_lock(&cluster->lock);
        if (cluster->block_group != block_group)
                goto out;
+        bitmap = cluster->points_to_bitmap;
+        cluster->block_group = NULL;
        cluster->window_start = 0;
+        list_del_init(&cluster->block_group_list);
+        cluster->points_to_bitmap = false;
+        if (bitmap)
+                goto out;
        node = rb_first(&cluster->root);
-        while(node) {
+        while (node) {
                entry = rb_entry(node, struct btrfs_free_space, offset_index);
                node = rb_next(&entry->offset_index);
                rb_erase(&entry->offset_index, &cluster->root);
-                link_free_space(block_group, entry);
+                BUG_ON(entry->bitmap);
+                tree_insert_offset(&block_group->free_space_offset,
+                                   entry->offset, &entry->offset_index, 0);
        }
-        list_del_init(&cluster->block_group_list);
-        btrfs_put_block_group(cluster->block_group);
-        cluster->block_group = NULL;
        cluster->root.rb_node = NULL;
 out:
        spin_unlock(&cluster->lock);
+        btrfs_put_block_group(block_group);
        return 0;
 }
@@ -425,20 +875,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
        struct btrfs_free_space *info;
        struct rb_node *node;
        struct btrfs_free_cluster *cluster;
-        struct btrfs_free_cluster *safe;
+        struct list_head *head;
        spin_lock(&block_group->tree_lock);
+        while ((head = block_group->cluster_list.next) !=
-        list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
+               &block_group->cluster_list) {
-                                 block_group_list) {
+                cluster = list_entry(head, struct btrfs_free_cluster,
+                                     block_group_list);
                WARN_ON(cluster->block_group != block_group);
                __btrfs_return_cluster_to_free_space(block_group, cluster);
+                if (need_resched()) {
+                        spin_unlock(&block_group->tree_lock);
+                        cond_resched();
+                        spin_lock(&block_group->tree_lock);
+                }
        }
-        while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
+        while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
-                info = rb_entry(node, struct btrfs_free_space, bytes_index);
+                info = rb_entry(node, struct btrfs_free_space, offset_index);
                unlink_free_space(block_group, info);
+                if (info->bitmap)
+                        kfree(info->bitmap);
                kfree(info);
                if (need_resched()) {
                        spin_unlock(&block_group->tree_lock);
@@ -446,6 +904,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
                        spin_lock(&block_group->tree_lock);
                }
        }
        spin_unlock(&block_group->tree_lock);
 }
@@ -453,25 +912,35 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
                               u64 offset, u64 bytes, u64 empty_size)
 {
        struct btrfs_free_space *entry = NULL;
+        u64 bytes_search = bytes + empty_size;
        u64 ret = 0;
        spin_lock(&block_group->tree_lock);
-        entry = tree_search_offset(&block_group->free_space_offset, offset,
+        entry = find_free_space(block_group, &offset, &bytes_search, 0);
-                                   bytes + empty_size, 1);
        if (!entry)
-                entry = tree_search_bytes(&block_group->free_space_bytes,
+                goto out;
-                                          offset, bytes + empty_size);
-        if (entry) {
+        ret = offset;
+        if (entry->bitmap) {
+                bitmap_clear_bits(block_group, entry, offset, bytes);
+                if (!entry->bytes) {
+                        unlink_free_space(block_group, entry);
+                        kfree(entry->bitmap);
+                        kfree(entry);
+                        block_group->total_bitmaps--;
+                        recalculate_thresholds(block_group);
+                }
+        } else {
                unlink_free_space(block_group, entry);
-                ret = entry->offset;
                entry->offset += bytes;
                entry->bytes -= bytes;
                if (!entry->bytes)
                        kfree(entry);
                else
                        link_free_space(block_group, entry);
        }
+out:
        spin_unlock(&block_group->tree_lock);
        return ret;
@@ -517,6 +986,54 @@ int btrfs_return_cluster_to_free_space(
        return ret;
 }
+/*
+ * Try to allocate 'bytes' from the bitmap entry that 'cluster' points at.
+ *
+ * Takes block_group->tree_lock and then cluster->lock (in that order) and
+ * drops both before returning.
+ *
+ * Returns the start offset found by search_bitmap() after clearing 'bytes'
+ * worth of bits there, or 0 if the cluster no longer points to a bitmap,
+ * belongs to a different block group, the bitmap entry cannot be found, or
+ * search_bitmap() reports an error.
+ */
+static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
+                                   struct btrfs_free_cluster *cluster,
+                                   u64 bytes, u64 min_start)
+{
+        struct btrfs_free_space *entry;
+        int err;
+        u64 search_start = cluster->window_start;
+        u64 search_bytes = bytes;
+        u64 ret = 0;
+        spin_lock(&block_group->tree_lock);
+        spin_lock(&cluster->lock);
+        /*
+         * btrfs_alloc_from_cluster() tests points_to_bitmap before taking any
+         * lock, so check it again now that cluster->lock is held.
+         */
+        if (!cluster->points_to_bitmap)
+                goto out;
+        if (cluster->block_group != block_group)
+                goto out;
+        /*
+         * search_start is the beginning of the bitmap, but at some point it may
+         * be a good idea to point to the actual start of the free area in the
+         * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
+         * to 1 to make sure we get the bitmap entry
+         */
+        entry = tree_search_offset(block_group,
+                                   offset_to_bitmap(block_group, search_start),
+                                   1, 0);
+        if (!entry || !entry->bitmap)
+                goto out;
+        /*
+         * From here on the search is driven by the caller's min_start, not by
+         * the cluster window.  search_bytes already equals 'bytes' from its
+         * initializer; it is simply reset alongside search_start.
+         */
+        search_start = min_start;
+        search_bytes = bytes;
+        err = search_bitmap(block_group, entry, &search_start,
+                            &search_bytes);
+        if (err)
+                goto out;
+        /* search_start now holds the offset search_bitmap() settled on */
+        ret = search_start;
+        bitmap_clear_bits(block_group, entry, ret, bytes);
+out:
+        spin_unlock(&cluster->lock);
+        spin_unlock(&block_group->tree_lock);
+        return ret;
+}
 /*
 * given a cluster, try to allocate 'bytes' from it, returns 0
 * if it couldn't find anything suitably large, or a logical disk offset
@@ -530,6 +1047,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
        struct rb_node *node;
        u64 ret = 0;
+        if (cluster->points_to_bitmap)
+                return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
+                                               min_start);
        spin_lock(&cluster->lock);
        if (bytes > cluster->max_size)
                goto out;
@@ -567,9 +1088,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
        }
 out:
        spin_unlock(&cluster->lock);
        return ret;
 }
+/*
+ * Try to set up 'cluster' as a window inside the bitmap entry 'entry'.
+ *
+ * Scans entry->bitmap, starting at the bit for max(offset, entry->offset),
+ * for runs of set bits that are at least min_bytes long (presumably set bits
+ * mark free sectors -- confirm against the bitmap helpers).  Run lengths are
+ * accumulated until they add up to at least 'bytes'.
+ *
+ * On success, sets cluster->window_start to the byte offset of the first
+ * accepted run, sets cluster->points_to_bitmap, leaves the longest accepted
+ * run (in bytes) in cluster->max_size, and returns 0.
+ * Returns -1 when no further run of at least min_bytes can be found.
+ *
+ * NOTE(review): on the -1 path cluster->max_size may already have been
+ * raised by an earlier pass and is not reset here.
+ */
+static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
+                                struct btrfs_free_space *entry,
+                                struct btrfs_free_cluster *cluster,
+                                u64 offset, u64 bytes, u64 min_bytes)
+{
+        unsigned long next_zero;
+        unsigned long i;
+        unsigned long search_bits;
+        unsigned long total_bits;
+        unsigned long found_bits;
+        unsigned long start = 0;
+        unsigned long total_found = 0;
+        bool found = false;
+        i = offset_to_bit(entry->offset, block_group->sectorsize,
+                          max_t(u64, offset, entry->offset));
+        /* minimum acceptable run length, and total wanted, both in bits */
+        search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+        total_bits = bytes_to_bits(bytes, block_group->sectorsize);
+again:
+        found_bits = 0;
+        /*
+         * Walk runs of set bits: i is the first set bit of a run and
+         * next_zero the first clear bit after it.  Runs shorter than
+         * search_bits are skipped by moving i to next_zero.
+         */
+        for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
+             i < BITS_PER_BITMAP;
+             i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
+                next_zero = find_next_zero_bit(entry->bitmap,
+                                               BITS_PER_BITMAP, i);
+                if (next_zero - i >= search_bits) {
+                        found_bits = next_zero - i;
+                        break;
+                }
+                i = next_zero;
+        }
+        if (!found_bits)
+                return -1;
+        /* remember where the window begins on the first accepted run */
+        if (!found) {
+                start = i;
+                found = true;
+        }
+        total_found += found_bits;
+        if (cluster->max_size < found_bits * block_group->sectorsize)
+                cluster->max_size = found_bits * block_group->sectorsize;
+        if (total_found < total_bits) {
+                i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
+                /*
+                 * The next run starts more than twice the wanted size past
+                 * the window start: drop what was collected and start a new
+                 * window from there.
+                 */
+                if (i - start > total_bits * 2) {
+                        total_found = 0;
+                        cluster->max_size = 0;
+                        found = false;
+                }
+                goto again;
+        }
+        cluster->window_start = start * block_group->sectorsize +
+                entry->offset;
+        cluster->points_to_bitmap = true;
+        return 0;
+}
 /*
 * here we try to find a cluster of blocks in a block group.  The goal
 * is to find at least bytes free and up to empty_size + bytes free.
@@ -587,12 +1172,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
        struct btrfs_free_space *entry = NULL;
        struct rb_node *node;
        struct btrfs_free_space *next;
-        struct btrfs_free_space *last;
+        struct btrfs_free_space *last = NULL;
        u64 min_bytes;
        u64 window_start;
        u64 window_free;
        u64 max_extent = 0;
-        int total_retries = 0;
+        bool found_bitmap = false;
        int ret;
        /* for metadata, allow allocates with more holes */
@@ -620,31 +1205,80 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
                goto out;
        }
 again:
-        min_bytes = min(min_bytes, bytes + empty_size);
+        entry = tree_search_offset(block_group, offset, found_bitmap, 1);
-        entry = tree_search_bytes(&block_group->free_space_bytes,
-                                  offset, min_bytes);
        if (!entry) {
                ret = -ENOSPC;
                goto out;
        }
+        /*
+         * If found_bitmap is true, we exhausted our search for extent entries,
+         * and we just want to search all of the bitmaps that we can find, and
+         * ignore any extent entries we find.
+         */
+        while (entry->bitmap || found_bitmap ||
+               (!entry->bitmap && entry->bytes < min_bytes)) {
+                struct rb_node *node = rb_next(&entry->offset_index);
+                if (entry->bitmap && entry->bytes > bytes + empty_size) {
+                        ret = btrfs_bitmap_cluster(block_group, entry, cluster,
+                                                   offset, bytes + empty_size,
+                                                   min_bytes);
+                        if (!ret)
+                                goto got_it;
+                }
+                if (!node) {
+                        ret = -ENOSPC;
+                        goto out;
+                }
+                entry = rb_entry(node, struct btrfs_free_space, offset_index);
+        }
+        /*
+         * We already searched all the extent entries from the passed in offset
+         * to the end and didn't find enough space for the cluster, and we also
+         * didn't find any bitmaps that met our criteria, just go ahead and exit
+         */
+        if (found_bitmap) {
+                ret = -ENOSPC;
+                goto out;
+        }
+        cluster->points_to_bitmap = false;
        window_start = entry->offset;
        window_free = entry->bytes;
        last = entry;
        max_extent = entry->bytes;
-        while(1) {
+        while (1) {
                /* out window is just right, lets fill it */
                if (window_free >= bytes + empty_size)
                        break;
                node = rb_next(&last->offset_index);
                if (!node) {
+                        if (found_bitmap)
+                                goto again;
                        ret = -ENOSPC;
                        goto out;
                }
                next = rb_entry(node, struct btrfs_free_space, offset_index);
                /*
+                 * we found a bitmap, so if this search doesn't result in a
+                 * cluster, we know to go and search again for the bitmaps and
+                 * start looking for space there
+                 */
+                if (next->bitmap) {
+                        if (!found_bitmap)
+                                offset = next->offset;
+                        found_bitmap = true;
+                        last = next;
+                        continue;
+                }
+                /*
                 * we haven't filled the empty size and the window is
                 * very large.  reset and try again
                 */
@@ -655,19 +1289,6 @@ again:
                        window_free = entry->bytes;
                        last = entry;
                        max_extent = 0;
-                        total_retries++;
-                        if (total_retries % 64 == 0) {
-                                if (min_bytes >= (bytes + empty_size)) {
-                                        ret = -ENOSPC;
-                                        goto out;
-                                }
-                                /*
-                                 * grow our allocation a bit, we're not having
-                                 * much luck
-                                 */
-                                min_bytes *= 2;
-                                goto again;
-                        }
                } else {
                        last = next;
                        window_free += next->bytes;
@@ -685,11 +1306,19 @@ again:
         * The cluster includes an rbtree, but only uses the offset index
         * of each free space cache entry.
         */
-        while(1) {
+        while (1) {
                node = rb_next(&entry->offset_index);
-                unlink_free_space(block_group, entry);
+                if (entry->bitmap && node) {
+                        entry = rb_entry(node, struct btrfs_free_space,
+                                         offset_index);
+                        continue;
+                } else if (entry->bitmap && !node) {
+                        break;
+                }
+                rb_erase(&entry->offset_index, &block_group->free_space_offset);
                ret = tree_insert_offset(&cluster->root, entry->offset,
-                                         &entry->offset_index);
+                                         &entry->offset_index, 0);
                BUG_ON(ret);
                if (!node || entry == last)
@@ -697,8 +1326,10 @@ again:
                entry = rb_entry(node, struct btrfs_free_space, offset_index);
        }
-        ret = 0;
        cluster->max_size = max_extent;
+got_it:
+        ret = 0;
        atomic_inc(&block_group->count);
        list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
        cluster->block_group = block_group;
@@ -718,6 +1349,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
        spin_lock_init(&cluster->refill_lock);
        cluster->root.rb_node = NULL;
        cluster->max_size = 0;
+        cluster->points_to_bitmap = false;
        INIT_LIST_HEAD(&cluster->block_group_list);
        cluster->block_group = NULL;
 }
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 266fb8764054..890a8e79011b 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -19,6 +19,14 @@
 #ifndef __BTRFS_FREE_SPACE_CACHE
 #define __BTRFS_FREE_SPACE_CACHE
+/*
+ * One entry in a block group's free space cache.  An entry is either a plain
+ * extent or, when 'bitmap' is non-NULL, a bitmap entry.
+ */
+struct btrfs_free_space {
+        /* rb-tree linkage, keyed by offset (block group tree or cluster root) */
+        struct rb_node offset_index;
+        /* start offset of the entry */
+        u64 offset;
+        /* number of bytes accounted to the entry */
+        u64 bytes;
+        /* NULL for extent entries; allocated bitmap for bitmap entries */
+        unsigned long *bitmap;
+        /* NOTE(review): list usage is not visible in this part of the patch */
+        struct list_head list;
+};
 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
                         u64 bytenr, u64 size);
 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8612b3a09811..272b9b2bea86 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -26,7 +26,6 @@
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/backing-dev.h>
 #include <linux/mpage.h>
 #include <linux/swap.h>
@@ -2122,10 +2121,8 @@ static void btrfs_read_locked_inode(struct inode *inode)
         * any xattrs or acls
         */
        maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino);
-        if (!maybe_acls) {
+        if (!maybe_acls)
-                BTRFS_I(inode)->i_acl = NULL;
+                cache_no_acl(inode);
-                BTRFS_I(inode)->i_default_acl = NULL;
-        }
        BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
                                                alloc_group_block, 0);
@@ -2606,8 +2603,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
        if (root->ref_cows)
                btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
        path = btrfs_alloc_path();
-        path->reada = -1;
        BUG_ON(!path);
+        path->reada = -1;
        /* FIXME, add redo link to tree so we don't leak on crash */
        key.objectid = inode->i_ino;
@@ -3141,9 +3138,6 @@ static noinline void init_btrfs_i(struct inode *inode)
 {
        struct btrfs_inode *bi = BTRFS_I(inode);
-        bi->i_acl = BTRFS_ACL_NOT_CACHED;
-        bi->i_default_acl = BTRFS_ACL_NOT_CACHED;
        bi->generation = 0;
        bi->sequence = 0;
        bi->last_trans = 0;
@@ -3585,12 +3579,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                owner = 1;
        BTRFS_I(inode)->block_group =
                        btrfs_find_block_group(root, 0, alloc_hint, owner);
-        if ((mode & S_IFREG)) {
-                if (btrfs_test_opt(root, NODATASUM))
-                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
-                if (btrfs_test_opt(root, NODATACOW))
-                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
-        }
        key[0].objectid = objectid;
        btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -3645,6 +3633,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        btrfs_inherit_iflags(inode, dir);
+        if ((mode & S_IFREG)) {
+                if (btrfs_test_opt(root, NODATASUM))
+                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+                if (btrfs_test_opt(root, NODATACOW))
+                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+        }
        insert_inode_hash(inode);
        inode_tree_add(inode);
        return inode;
@@ -4640,8 +4635,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->last_trans = 0;
        ei->logged_trans = 0;
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
-        ei->i_acl = BTRFS_ACL_NOT_CACHED;
-        ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
        INIT_LIST_HEAD(&ei->i_orphan);
        INIT_LIST_HEAD(&ei->ordered_operations);
        return &ei->vfs_inode;
@@ -4655,13 +4648,6 @@ void btrfs_destroy_inode(struct inode *inode)
        WARN_ON(!list_empty(&inode->i_dentry));
        WARN_ON(inode->i_data.nrpages);
-        if (BTRFS_I(inode)->i_acl &&
-            BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
-                posix_acl_release(BTRFS_I(inode)->i_acl);
-        if (BTRFS_I(inode)->i_default_acl &&
-            BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
-                posix_acl_release(BTRFS_I(inode)->i_default_acl);
        /*
         * Make sure we're properly removed from the ordered operation
         * lists.
@@ -4799,8 +4785,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         * and the replacement file is large.  Start IO on it now so
         * we don't add too much work to the end of the transaction
         */
-        if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
+        if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
-            new_inode->i_size &&
            old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
                filemap_flush(old_inode->i_mapping);
@@ -5096,6 +5081,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
        u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
        struct extent_map *em;
        struct btrfs_trans_handle *trans;
+        struct btrfs_root *root;
        int ret;
        alloc_start = offset & ~mask;
@@ -5114,6 +5100,13 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                        goto out;
        }
+        root = BTRFS_I(inode)->root;
+        ret = btrfs_check_data_free_space(root, inode,
+                                          alloc_end - alloc_start);
+        if (ret)
+                goto out;
        locked_end = alloc_end - 1;
        while (1) {
                struct btrfs_ordered_extent *ordered;
@@ -5121,7 +5114,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
                if (!trans) {
                        ret = -EIO;
-                        goto out;
+                        goto out_free;
                }
                /* the extent lock is ordered inside the running
@@ -5182,6 +5175,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                      GFP_NOFS);
        btrfs_end_transaction(trans, BTRFS_I(inode)->root);
+out_free:
+        btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start);
 out:
        mutex_unlock(&inode->i_mutex);
        return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index eff18f5b5362..bd88f25889f7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -27,7 +27,6 @@
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/backing-dev.h>
 #include <linux/mount.h>
 #include <linux/mpage.h>
@@ -1028,7 +1027,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                                                struct btrfs_file_extent_item);
                        comp = btrfs_file_extent_compression(leaf, extent);
                        type = btrfs_file_extent_type(leaf, extent);
-                        if (type == BTRFS_FILE_EXTENT_REG) {
+                        if (type == BTRFS_FILE_EXTENT_REG ||
+                            type == BTRFS_FILE_EXTENT_PREALLOC) {
                                disko = btrfs_file_extent_disk_bytenr(leaf,
                                                                      extent);
                                diskl = btrfs_file_extent_disk_num_bytes(leaf,
@@ -1051,7 +1051,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                        new_key.objectid = inode->i_ino;
                        new_key.offset = key.offset + destoff - off;
-                        if (type == BTRFS_FILE_EXTENT_REG) {
+                        if (type == BTRFS_FILE_EXTENT_REG ||
+                            type == BTRFS_FILE_EXTENT_PREALLOC) {
                                ret = btrfs_insert_empty_item(trans, root, path,
                                                              &new_key, size);
                                if (ret)
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 6d6523da0a30..0d126be22b63 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -309,7 +309,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
        }
        printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
               (unsigned long long)btrfs_header_bytenr(c),
-               btrfs_header_level(c), nr,
+              level, nr,
               (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
        for (i = 0; i < nr; i++) {
                btrfs_node_key_to_cpu(c, &key, i);
@@ -326,10 +326,10 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
                                        btrfs_level_size(root, level - 1),
                                        btrfs_node_ptr_generation(c, i));
                if (btrfs_is_leaf(next) &&
-                    btrfs_header_level(c) != 1)
+                   level != 1)
                        BUG();
                if (btrfs_header_level(next) !=
-                        btrfs_header_level(c) - 1)
+                       level - 1)
                        BUG();
                btrfs_print_tree(root, next);
                free_extent_buffer(next);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b23dc209ae10..c04f7f212602 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -670,6 +670,8 @@ again:
                        err = ret;
                        goto out;
                }
+                if (ret > 0 && path2->slots[level] > 0)
+                        path2->slots[level]--;
                eb = path2->nodes[level];
                WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
@@ -1609,6 +1611,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
                BUG_ON(level == 0);
                path->lowest_level = level;
                ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
+                path->lowest_level = 0;
                if (ret < 0) {
                        btrfs_free_path(path);
                        return ret;
@@ -1788,7 +1791,7 @@ static void merge_func(struct btrfs_work *work)
                btrfs_end_transaction(trans, root);
        }
-        btrfs_drop_dead_root(reloc_root);
+        btrfs_drop_snapshot(reloc_root, 0);
        if (atomic_dec_and_test(async->num_pending))
                complete(async->done);
@@ -2075,9 +2078,6 @@ static int do_relocation(struct btrfs_trans_handle *trans,
                        ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
                        BUG_ON(ret);
-                        btrfs_tree_unlock(eb);
-                        free_extent_buffer(eb);
                }
                if (!lowest) {
                        btrfs_tree_unlock(upper->eb);
@@ -2553,8 +2553,13 @@ int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
        last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
        /* make sure the dirty trick played by the caller work */
-        ret = invalidate_inode_pages2_range(inode->i_mapping,
+        while (1) {
-                                            first_index, last_index);
+                ret = invalidate_inode_pages2_range(inode->i_mapping,
+                                                    first_index, last_index);
+                if (ret != -EBUSY)
+                        break;
+                schedule_timeout(HZ/10);
+        }
        if (ret)
                goto out_unlock;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9f179d4832d5..6d6d06cb6dfc 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -26,7 +26,6 @@
 #include <linux/init.h>
 #include <linux/seq_file.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/backing-dev.h>
 #include <linux/mount.h>
 #include <linux/mpage.h>
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2e177d7f4bb9..cdbb5022da52 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -40,6 +40,12 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
        }
 }
+static noinline void switch_commit_root(struct btrfs_root *root)
+{
+        free_extent_buffer(root->commit_root);
+        root->commit_root = btrfs_root_node(root);
+}
 /*
 * either allocate a new transaction or hop into the existing one
 */
@@ -444,9 +450,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
        btrfs_write_dirty_block_groups(trans, root);
-        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-        BUG_ON(ret);
        while (1) {
                old_root_bytenr = btrfs_root_bytenr(&root->root_item);
                if (old_root_bytenr == root->node->start)
@@ -457,13 +460,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
                                        &root->root_key,
                                        &root->root_item);
                BUG_ON(ret);
-                btrfs_write_dirty_block_groups(trans, root);
-                ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+                ret = btrfs_write_dirty_block_groups(trans, root);
                BUG_ON(ret);
        }
-        free_extent_buffer(root->commit_root);
-        root->commit_root = btrfs_root_node(root);
+        if (root != root->fs_info->extent_root)
+                switch_commit_root(root);
        return 0;
 }
@@ -495,10 +499,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
                root = list_entry(next, struct btrfs_root, dirty_list);
                update_cowonly_root(trans, root);
-                ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-                BUG_ON(ret);
        }
+        down_write(&fs_info->extent_commit_sem);
+        switch_commit_root(fs_info->extent_root);
+        up_write(&fs_info->extent_commit_sem);
        return 0;
 }
@@ -543,13 +549,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
                        btrfs_free_log(trans, root);
                        btrfs_update_reloc_root(trans, root);
-                        if (root->commit_root == root->node)
+                        if (root->commit_root != root->node) {
-                                continue;
+                                switch_commit_root(root);
+                                btrfs_set_root_node(&root->root_item,
-                        free_extent_buffer(root->commit_root);
+                                                    root->node);
-                        root->commit_root = btrfs_root_node(root);
+                        }
-                        btrfs_set_root_node(&root->root_item, root->node);
                        err = btrfs_update_root(trans, fs_info->tree_root,
                                                &root->root_key,
                                                &root->root_item);
@@ -593,6 +598,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
        return 0;
 }
+#if 0
 /*
 * when dropping snapshots, we generate a ton of delayed refs, and it makes
 * sense not to join the transaction while it is trying to flush the current
@@ -681,6 +687,7 @@ int btrfs_drop_dead_root(struct btrfs_root *root)
        btrfs_btree_balance_dirty(tree_root, nr);
        return ret;
 }
+#endif
 /*
 * new snapshots need to be created at a very specific time in the
@@ -850,6 +857,16 @@ static void update_super_roots(struct btrfs_root *root)
        super->root_level = root_item->level;
 }
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
+{
+        int ret = 0;
+        spin_lock(&info->new_trans_lock);
+        if (info->running_transaction)
+                ret = info->running_transaction->in_commit;
+        spin_unlock(&info->new_trans_lock);
+        return ret;
+}
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root)
 {
@@ -941,9 +958,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                mutex_unlock(&root->fs_info->trans_mutex);
-                if (flush_on_commit || snap_pending) {
+                if (flush_on_commit) {
-                        if (flush_on_commit)
+                        btrfs_start_delalloc_inodes(root);
-                                btrfs_start_delalloc_inodes(root);
+                        ret = btrfs_wait_ordered_extents(root, 0);
+                        BUG_ON(ret);
+                } else if (snap_pending) {
                        ret = btrfs_wait_ordered_extents(root, 1);
                        BUG_ON(ret);
                }
@@ -1007,15 +1026,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        btrfs_set_root_node(&root->fs_info->tree_root->root_item,
                            root->fs_info->tree_root->node);
-        free_extent_buffer(root->fs_info->tree_root->commit_root);
+        switch_commit_root(root->fs_info->tree_root);
-        root->fs_info->tree_root->commit_root =
-                                btrfs_root_node(root->fs_info->tree_root);
        btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
                            root->fs_info->chunk_root->node);
-        free_extent_buffer(root->fs_info->chunk_root->commit_root);
+        switch_commit_root(root->fs_info->chunk_root);
-        root->fs_info->chunk_root->commit_root =
-                                btrfs_root_node(root->fs_info->chunk_root);
        update_super_roots(root);
@@ -1055,6 +1070,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        cur_trans->commit_done = 1;
        root->fs_info->last_trans_committed = cur_trans->transid;
        wake_up(&cur_trans->commit_wait);
        put_transaction(cur_trans);
@@ -1081,7 +1097,7 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
        while (!list_empty(&list)) {
                root = list_entry(list.next, struct btrfs_root, root_list);
                list_del_init(&root->root_list);
-                btrfs_drop_dead_root(root);
+                btrfs_drop_snapshot(root, 0);
        }
        return 0;
 }
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 961c3ee5a2e1..663c67404918 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -107,4 +107,5 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root);
 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
                                        struct extent_io_tree *dirty_pages);
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
 #endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c13922206d1b..d91b0de7c502 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -797,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
                return -ENOENT;
        inode = read_one_inode(root, key->objectid);
-        BUG_ON(!dir);
+        BUG_ON(!inode);
        ref_ptr = btrfs_item_ptr_offset(eb, slot);
        ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3ab80e9cd767..5dbefd11b4af 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -721,7 +721,8 @@ error:
 */
 static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
                                         struct btrfs_device *device,
-                                         u64 num_bytes, u64 *start)
+                                         u64 num_bytes, u64 *start,
+                                         u64 *max_avail)
 {
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
@@ -758,9 +759,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
        ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;
-        ret = btrfs_previous_item(root, path, 0, key.type);
+        if (ret > 0) {
-        if (ret < 0)
+                ret = btrfs_previous_item(root, path, key.objectid, key.type);
-                goto error;
+                if (ret < 0)
+                        goto error;
+                if (ret > 0)
+                        start_found = 1;
+        }
        l = path->nodes[0];
        btrfs_item_key_to_cpu(l, &key, path->slots[0]);
        while (1) {
@@ -803,6 +808,10 @@ no_more_items:
                        if (last_byte < search_start)
                                last_byte = search_start;
                        hole_size = key.offset - last_byte;
+                        if (hole_size > *max_avail)
+                                *max_avail = hole_size;
                        if (key.offset > last_byte &&
                            hole_size >= num_bytes) {
                                *start = last_byte;
@@ -1621,6 +1630,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
        device->fs_devices->total_rw_bytes += diff;
        device->total_bytes = new_size;
+        device->disk_total_bytes = new_size;
        btrfs_clear_space_info_full(device->dev_root->fs_info);
        return btrfs_update_device(trans, device);
@@ -2007,7 +2017,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
                        goto done;
                if (ret) {
                        ret = 0;
-                        goto done;
+                        break;
                }
                l = path->nodes[0];
@@ -2015,7 +2025,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
                btrfs_item_key_to_cpu(l, &key, path->slots[0]);
                if (key.objectid != device->devid)
-                        goto done;
+                        break;
                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                length = btrfs_dev_extent_length(l, dev_extent);
@@ -2171,6 +2181,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                             max_chunk_size);
 again:
+        max_avail = 0;
        if (!map || map->num_stripes != num_stripes) {
                kfree(map);
                map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -2219,7 +2230,8 @@ again:
                if (device->in_fs_metadata && avail >= min_free) {
                        ret = find_free_dev_extent(trans, device,
-                                                   min_free, &dev_offset);
+                                                   min_free, &dev_offset,
+                                                   &max_avail);
                        if (ret == 0) {
                                list_move_tail(&device->dev_alloc_list,
                                               &private_devs);
@@ -2795,26 +2807,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
                }
        }
-        for (i = 0; i > nr; i++) {
-                struct btrfs_multi_bio *multi;
-                struct btrfs_bio_stripe *stripe;
-                int ret;
-                length = 1;
-                ret = btrfs_map_block(map_tree, WRITE, buf[i],
-                                      &length, &multi, 0);
-                BUG_ON(ret);
-                stripe = multi->stripes;
-                for (j = 0; j < multi->num_stripes; j++) {
-                        if (stripe->physical >= physical &&
-                            physical < stripe->physical + length)
-                                break;
-                }
-                BUG_ON(j >= multi->num_stripes);
-                kfree(multi);
-        }
        *logical = buf;
        *naddrs = nr;
        *stripe_len = map->stripe_len;
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index ecfbce836d32..3e2b90eaa239 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -208,7 +208,7 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
        *total_in = 0;
        workspace = find_zlib_workspace();
-        if (!workspace)
+        if (IS_ERR(workspace))
                return -1;
        if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
@@ -366,7 +366,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
        char *kaddr;
        workspace = find_zlib_workspace();
-        if (!workspace)
+        if (IS_ERR(workspace))
                return -ENOMEM;
        data_in = kmap(pages_in[page_in_index]);
@@ -547,7 +547,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
                return -ENOMEM;
        workspace = find_zlib_workspace();
-        if (!workspace)
+        if (IS_ERR(workspace))
                return -ENOMEM;
        workspace->inf_strm.next_in = data_in;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index b7c9d5187a75..a173551e19d7 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -13,7 +13,6 @@
 #include <linux/major.h>
 #include <linux/errno.h>
 #include <linux/module.h>
-#include <linux/smp_lock.h>
 #include <linux/seq_file.h>
 #include <linux/kobject.h>
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index b48689839428..e85b1e4389e0 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,10 @@
+Version 1.60
+-------------
+Fix memory leak in reconnect.  Fix oops in DFS mount error path.
+Set s_maxbytes to a smaller value (the max that the vfs can handle) so that
+sendfile will now work over cifs mounts again.  Add noforcegid
+and noforceuid mount parameters.
 Version 1.59
 ------------
 Client uses server inode numbers (which are persistent) rather than
@@ -5,7 +12,11 @@ client generated ones by default (mount option "serverino" turned
 on by default if server supports it).  Add forceuid and forcegid
 mount options (so that when negotiating unix extensions specifying
 which uid mounted does not immediately force the server's reported
-uids to be overridden).
+uids to be overridden).  Add support for scope mount parm. Improve
+hard link detection to use same inode for both.  Do not set
+read-only dos attribute on directories (for chmod) since Windows
+explorer special cases this attribute bit for directories for
+a different purpose.
 Version 1.58
 ------------
diff --git a/fs/cifs/README b/fs/cifs/README
index ad92921dbde4..79c1a93400be 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -262,11 +262,11 @@ A partial list of the supported mount options follows:
                mount.  
  domain        Set the SMB/CIFS workgroup name prepended to the
                username during CIFS session establishment
-  forceuid      Set the default uid for inodes based on the uid
+  forceuid      Set the default uid for inodes to the uid
-                passed in. For mounts to servers
+                passed in on mount. For mounts to servers
                which do support the CIFS Unix extensions, such as a
                properly configured Samba server, the server provides
-                the uid, gid and mode so this parameter should  not be
+                the uid, gid and mode so this parameter should not be
                specified unless the server and clients uid and gid
                numbering differ.  If the server and client are in the
                same domain (e.g. running winbind or nss_ldap) and
@@ -278,11 +278,7 @@ A partial list of the supported mount options follows:
                of existing files will be the uid (gid) of the person
                who executed the mount (root, except when mount.cifs
                is configured setuid for user mounts) unless the "uid=" 
-                (gid) mount option is specified.  For the uid (gid) of newly
+                (gid) mount option is specified. Also note that permission
-                created files and directories, ie files created since 
-                the last mount of the server share, the expected uid 
-                (gid) is cached as long as the inode remains in 
-                memory on the client.   Also note that permission
                checks (authorization checks) on accesses to a file occur
                at the server, but there are cases in which an administrator
                may want to restrict at the client as well.  For those
@@ -290,12 +286,15 @@ A partial list of the supported mount options follows:
                (such as Windows), permissions can also be checked at the
                client, and a crude form of client side permission checking 
                can be enabled by specifying file_mode and dir_mode on 
-                the client.  Note that the mount.cifs helper must be
+                the client.  (default)
-                at version 1.10 or higher to support specifying the uid
+  forcegid      (similar to above but for the groupid instead of uid) (default)
-                (or gid) in non-numeric form.
+  noforceuid    Fill in file owner information (uid) by requesting it from
-  forcegid      (similar to above but for the groupid instead of uid)
+                the server if possible. With this option, the value given in
+                the uid= option (on mount) will only be used if the server
+                cannot support returning uids on inodes.
+  noforcegid    (similar to above but for the group owner, gid, instead of uid)
  uid           Set the default uid for inodes, and indicate to the
-                cifs kernel driver which local user mounted . If the server
+                cifs kernel driver which local user mounted. If the server
                supports the unix extensions the default uid is
                not used to fill in the owner fields of inodes (files)
                unless the "forceuid" parameter is specified.
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 1b09f1670061..20692fbfdb24 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -49,6 +49,7 @@
 #define ASN1_OJI        6       /* Object Identifier  */
 #define ASN1_OJD        7       /* Object Description */
 #define ASN1_EXT        8       /* External */
+#define ASN1_ENUM       10      /* Enumerated */
 #define ASN1_SEQ        16      /* Sequence */
 #define ASN1_SET        17      /* Set */
 #define ASN1_NUMSTR     18      /* Numerical String */
@@ -78,10 +79,12 @@
 #define SPNEGO_OID_LEN 7
 #define NTLMSSP_OID_LEN  10
 #define KRB5_OID_LEN  7
+#define KRB5U2U_OID_LEN  8
 #define MSKRB5_OID_LEN  7
 static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 };
 static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 };
 static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 };
+static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 };
 static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 };
 /*
@@ -122,6 +125,28 @@ asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch)
        return 1;
 }
+#if 0 /* will be needed later by spnego decoding/encoding of ntlmssp */
+static unsigned char
+asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val)
+{
+        unsigned char ch;
+        if (ctx->pointer >= ctx->end) {
+                ctx->error = ASN1_ERR_DEC_EMPTY;
+                return 0;
+        }
+        ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */
+        if ((ch) == ASN1_ENUM)  /* if ch value is ENUM, 0xa */
+                *val = *(++(ctx->pointer)); /* value has enum value */
+        else
+                return 0;
+        ctx->pointer++;
+        return 1;
+}
+#endif
 static unsigned char
 asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag)
 {
@@ -476,10 +501,9 @@ decode_negTokenInit(unsigned char *security_blob, int length,
        unsigned int cls, con, tag, oidlen, rc;
        bool use_ntlmssp = false;
        bool use_kerberos = false;
+        bool use_kerberosu2u = false;
        bool use_mskerberos = false;
-        *secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/
        /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */
        asn1_open(&ctx, security_blob, length);
@@ -515,6 +539,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
                return 0;
        }
+        /* SPNEGO */
        if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
                cFYI(1, ("Error decoding negTokenInit"));
                return 0;
@@ -526,6 +551,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
                return 0;
        }
+        /* negTokenInit */
        if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
                cFYI(1, ("Error decoding negTokenInit"));
                return 0;
@@ -537,6 +563,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
                return 0;
        }
+        /* sequence */
        if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
                cFYI(1, ("Error decoding 2nd part of negTokenInit"));
                return 0;
@@ -548,6 +575,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
                return 0;
        }
+        /* sequence of */
        if (asn1_header_decode
            (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
                cFYI(1, ("Error decoding 2nd part of negTokenInit"));
@@ -560,6 +588,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
                return 0;
        }
+        /* list of security mechanisms */
        while (!asn1_eoc_decode(&ctx, sequence_end)) {
                rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
                if (!rc) {
@@ -576,11 +605,15 @@ decode_negTokenInit(unsigned char *security_blob, int length,
                                if (compare_oid(oid, oidlen, MSKRB5_OID,
                                                MSKRB5_OID_LEN) &&
-                                                !use_kerberos)
+                                                !use_mskerberos)
                                        use_mskerberos = true;
+                                else if (compare_oid(oid, oidlen, KRB5U2U_OID,
+                                                     KRB5U2U_OID_LEN) &&
+                                                     !use_kerberosu2u)
+                                        use_kerberosu2u = true;
                                else if (compare_oid(oid, oidlen, KRB5_OID,
                                                     KRB5_OID_LEN) &&
-                                                     !use_mskerberos)
+                                                     !use_kerberos)
                                        use_kerberos = true;
                                else if (compare_oid(oid, oidlen, NTLMSSP_OID,
                                                     NTLMSSP_OID_LEN))
@@ -593,7 +626,12 @@ decode_negTokenInit(unsigned char *security_blob, int length,
                }
        }
+        /* mechlistMIC */
        if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+                /* Check if we have reached the end of the blob, but with
+                   no mechListMic (e.g. NTLMSSP instead of KRB5) */
+                if (ctx.error == ASN1_ERR_DEC_EMPTY)
+                        goto decode_negtoken_exit;
                cFYI(1, ("Error decoding last part negTokenInit exit3"));
                return 0;
        } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
@@ -602,6 +640,8 @@ decode_negTokenInit(unsigned char *security_blob, int length,
                         cls, con, tag, end, *end));
                return 0;
        }
+        /* sequence */
        if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
                cFYI(1, ("Error decoding last part negTokenInit exit5"));
                return 0;
@@ -611,6 +651,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
                        cls, con, tag, end, *end));
        }
+        /* sequence of */
        if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
                cFYI(1, ("Error decoding last part negTokenInit exit 7"));
                return 0;
@@ -619,6 +660,8 @@ decode_negTokenInit(unsigned char *security_blob, int length,
                         cls, con, tag, end, *end));
                return 0;
        }
+        /* general string */
        if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
                cFYI(1, ("Error decoding last part negTokenInit exit9"));
                return 0;
@@ -630,13 +673,13 @@ decode_negTokenInit(unsigned char *security_blob, int length,
        }
        cFYI(1, ("Need to call asn1_octets_decode() function for %s",
                 ctx.pointer)); /* is this UTF-8 or ASCII? */
+decode_negtoken_exit:
        if (use_kerberos)
                *secType = Kerberos;
        else if (use_mskerberos)
                *secType = MSKerberos;
        else if (use_ntlmssp)
-                *secType = NTLMSSP;
+                *secType = RawNTLMSSP;
        return 1;
 }
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 7f19fefd3d45..42cec2a7c0cf 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -261,6 +261,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
                                        atomic_set(&tcon->num_reads, 0);
                                        atomic_set(&tcon->num_oplock_brks, 0);
                                        atomic_set(&tcon->num_opens, 0);
+                                        atomic_set(&tcon->num_posixopens, 0);
+                                        atomic_set(&tcon->num_posixmkdirs, 0);
                                        atomic_set(&tcon->num_closes, 0);
                                        atomic_set(&tcon->num_deletes, 0);
                                        atomic_set(&tcon->num_mkdirs, 0);
@@ -347,11 +349,15 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
                                        atomic_read(&tcon->num_locks),
                                        atomic_read(&tcon->num_hardlinks),
                                        atomic_read(&tcon->num_symlinks));
-                                seq_printf(m, "\nOpens: %d Closes: %d"
+                                seq_printf(m, "\nOpens: %d Closes: %d "
                                              "Deletes: %d",
                                        atomic_read(&tcon->num_opens),
                                        atomic_read(&tcon->num_closes),
                                        atomic_read(&tcon->num_deletes));
+                                seq_printf(m, "\nPosix Opens: %d "
+                                              "Posix Mkdirs: %d",
+                                        atomic_read(&tcon->num_posixopens),
+                                        atomic_read(&tcon->num_posixmkdirs));
                                seq_printf(m, "\nMkdirs: %d Rmdirs: %d",
                                        atomic_read(&tcon->num_mkdirs),
                                        atomic_read(&tcon->num_rmdirs));
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 3bb11be8b6a8..606912d8f2a8 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -55,7 +55,7 @@ void cifs_dfs_release_automount_timer(void)
 * i.e. strips from UNC trailing path that is not part of share
 * name and fixup missing '\' in the begining of DFS node refferal
 * if neccessary.
- * Returns pointer to share name on success or NULL on error.
+ * Returns pointer to share name on success or ERR_PTR on error.
 * Caller is responsible for freeing returned string.
 */
 static char *cifs_get_share_name(const char *node_name)
@@ -68,7 +68,7 @@ static char *cifs_get_share_name(const char *node_name)
        UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */,
                         GFP_KERNEL);
        if (!UNC)
-                return NULL;
+                return ERR_PTR(-ENOMEM);
        /* get share name and server name */
        if (node_name[1] != '\\') {
@@ -87,7 +87,7 @@ static char *cifs_get_share_name(const char *node_name)
                cERROR(1, ("%s: no server name end in node name: %s",
                        __func__, node_name));
                kfree(UNC);
-                return NULL;
+                return ERR_PTR(-EINVAL);
        }
        /* find sharename end */
@@ -133,6 +133,12 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
                return ERR_PTR(-EINVAL);
        *devname = cifs_get_share_name(ref->node_name);
+        if (IS_ERR(*devname)) {
+                rc = PTR_ERR(*devname);
+                *devname = NULL;
+                goto compose_mount_options_err;
+        }
        rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
        if (rc != 0) {
                cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d",
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 4a4581cb2b5e..051caecf7d67 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -86,6 +86,9 @@ struct key_type cifs_spnego_key_type = {
 /* strlen of ";user=" */
 #define USER_KEY_LEN            6
+/* strlen of ";pid=0x" */
+#define PID_KEY_LEN             7
 /* get a key struct with a SPNEGO security blob, suitable for session setup */
 struct key *
 cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
@@ -103,7 +106,8 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
                   IP_KEY_LEN + INET6_ADDRSTRLEN +
                   MAX_MECH_STR_LEN +
                   UID_KEY_LEN + (sizeof(uid_t) * 2) +
-                   USER_KEY_LEN + strlen(sesInfo->userName) + 1;
+                   USER_KEY_LEN + strlen(sesInfo->userName) +
+                   PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
        spnego_key = ERR_PTR(-ENOMEM);
        description = kzalloc(desc_len, GFP_KERNEL);
@@ -141,6 +145,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
        dp = description + strlen(description);
        sprintf(dp, ";user=%s", sesInfo->userName);
+        dp = description + strlen(description);
+        sprintf(dp, ";pid=0x%x", current->pid);
        cFYI(1, ("key description = %s", description));
        spnego_key = request_key(&cifs_spnego_key_type, description, "");
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 60e3c4253de0..714a542cbafc 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -44,7 +44,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
        int maxwords = maxbytes / 2;
        char tmp[NLS_MAX_CHARSET_SIZE];
-        for (i = 0; from[i] && i < maxwords; i++) {
+        for (i = 0; i < maxwords && from[i]; i++) {
                charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp,
                                             NLS_MAX_CHARSET_SIZE);
                if (charlen > 0)
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 1403b5d86a73..6941c22398a6 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -327,7 +327,7 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
 static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
                       struct cifs_sid *pownersid, struct cifs_sid *pgrpsid,
-                       struct inode *inode)
+                       struct cifs_fattr *fattr)
 {
        int i;
        int num_aces = 0;
@@ -340,7 +340,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
        if (!pdacl) {
                /* no DACL in the security descriptor, set
                   all the permissions for user/group/other */
-                inode->i_mode |= S_IRWXUGO;
+                fattr->cf_mode |= S_IRWXUGO;
                return;
        }
@@ -357,7 +357,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
        /* reset rwx permissions for user/group/other.
           Also, if num_aces is 0 i.e. DACL has no ACEs,
           user/group/other have no permissions */
-        inode->i_mode &= ~(S_IRWXUGO);
+        fattr->cf_mode &= ~(S_IRWXUGO);
        acl_base = (char *)pdacl;
        acl_size = sizeof(struct cifs_acl);
@@ -379,17 +379,17 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
                        if (compare_sids(&(ppace[i]->sid), pownersid))
                                access_flags_to_mode(ppace[i]->access_req,
                                                     ppace[i]->type,
-                                                     &(inode->i_mode),
+                                                     &fattr->cf_mode,
                                                     &user_mask);
                        if (compare_sids(&(ppace[i]->sid), pgrpsid))
                                access_flags_to_mode(ppace[i]->access_req,
                                                     ppace[i]->type,
-                                                     &(inode->i_mode),
+                                                     &fattr->cf_mode,
                                                     &group_mask);
                        if (compare_sids(&(ppace[i]->sid), &sid_everyone))
                                access_flags_to_mode(ppace[i]->access_req,
                                                     ppace[i]->type,
-                                                     &(inode->i_mode),
+                                                     &fattr->cf_mode,
                                                     &other_mask);
 /*                      memcpy((void *)(&(cifscred->aces[i])),
@@ -464,7 +464,7 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
 /* Convert CIFS ACL to POSIX form */
 static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
-                          struct inode *inode)
+                          struct cifs_fattr *fattr)
 {
        int rc;
        struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
@@ -472,7 +472,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
        char *end_of_acl = ((char *)pntsd) + acl_len;
        __u32 dacloffset;
-        if ((inode == NULL) || (pntsd == NULL))
+        if (pntsd == NULL)
                return -EIO;
        owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
@@ -497,7 +497,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
        if (dacloffset)
                parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
-                           group_sid_ptr, inode);
+                           group_sid_ptr, fattr);
        else
                cFYI(1, ("no ACL")); /* BB grant all or default perms? */
@@ -508,7 +508,6 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
        memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr,
                        sizeof(struct cifs_sid)); */
        return 0;
 }
@@ -671,8 +670,9 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 }
 /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
-void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode,
+void
-                     const char *path, const __u16 *pfid)
+cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
+                  struct inode *inode, const char *path, const __u16 *pfid)
 {
        struct cifs_ntsd *pntsd = NULL;
        u32 acllen = 0;
@@ -687,7 +687,7 @@ void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode,
        /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
        if (pntsd)
-                rc = parse_sec_desc(pntsd, acllen, inode);
+                rc = parse_sec_desc(pntsd, acllen, fattr);
        if (rc)
                cFYI(1, ("parse sec desc failed rc = %d", rc));
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0d92114195ab..84b75253b05a 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -308,7 +308,6 @@ cifs_alloc_inode(struct super_block *sb)
        if (!cifs_inode)
                return NULL;
        cifs_inode->cifsAttrs = 0x20;   /* default */
-        atomic_set(&cifs_inode->inUse, 0);
        cifs_inode->time = 0;
        cifs_inode->write_behind_rc = 0;
        /* Until the file is open and we have gotten oplock
@@ -333,6 +332,27 @@ cifs_destroy_inode(struct inode *inode)
        kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
 }
+static void
+cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
+{
+        seq_printf(s, ",addr=");
+        switch (server->addr.sockAddr.sin_family) {
+        case AF_INET:
+                seq_printf(s, "%pI4", &server->addr.sockAddr.sin_addr.s_addr);
+                break;
+        case AF_INET6:
+                seq_printf(s, "%pI6",
+                           &server->addr.sockAddr6.sin6_addr.s6_addr);
+                if (server->addr.sockAddr6.sin6_scope_id)
+                        seq_printf(s, "%%%u",
+                                   server->addr.sockAddr6.sin6_scope_id);
+                break;
+        default:
+                seq_printf(s, "(unknown)");
+        }
+}
 /*
 * cifs_show_options() is for displaying mount options in /proc/mounts.
 * Not all settable options are displayed but most of the important
@@ -343,83 +363,68 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
 {
        struct cifs_sb_info *cifs_sb;
        struct cifsTconInfo *tcon;
-        struct TCP_Server_Info *server;
        cifs_sb = CIFS_SB(m->mnt_sb);
+        tcon = cifs_sb->tcon;
-        if (cifs_sb) {
+        seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName);
-                tcon = cifs_sb->tcon;
+        if (tcon->ses->userName)
-                if (tcon) {
+                seq_printf(s, ",username=%s", tcon->ses->userName);
-                        seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName);
+        if (tcon->ses->domainName)
-                        if (tcon->ses) {
+                seq_printf(s, ",domain=%s", tcon->ses->domainName);
-                                if (tcon->ses->userName)
-                                        seq_printf(s, ",username=%s",
+        seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
-                                           tcon->ses->userName);
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
-                                if (tcon->ses->domainName)
+                seq_printf(s, ",forceuid");
-                                        seq_printf(s, ",domain=%s",
+        else
-                                           tcon->ses->domainName);
+                seq_printf(s, ",noforceuid");
-                                server = tcon->ses->server;
-                                if (server) {
+        seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
-                                        seq_printf(s, ",addr=");
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
-                                        switch (server->addr.sockAddr6.
+                seq_printf(s, ",forcegid");
-                                                sin6_family) {
+        else
-                                        case AF_INET6:
+                seq_printf(s, ",noforcegid");
-                                                seq_printf(s, "%pI6",
-                                                           &server->addr.sockAddr6.sin6_addr);
+        cifs_show_address(s, tcon->ses->server);
-                                                break;
-                                        case AF_INET:
+        if (!tcon->unix_ext)
-                                                seq_printf(s, "%pI4",
+                seq_printf(s, ",file_mode=0%o,dir_mode=0%o",
-                                                           &server->addr.sockAddr.sin_addr.s_addr);
-                                                break;
-                                        }
-                                }
-                        }
-                        if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) ||
-                           !(tcon->unix_ext))
-                                seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
-                        if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) ||
-                           !(tcon->unix_ext))
-                                seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
-                        if (!tcon->unix_ext) {
-                                seq_printf(s, ",file_mode=0%o,dir_mode=0%o",
                                           cifs_sb->mnt_file_mode,
                                           cifs_sb->mnt_dir_mode);
-                        }
+        if (tcon->seal)
-                        if (tcon->seal)
+                seq_printf(s, ",seal");
-                                seq_printf(s, ",seal");
+        if (tcon->nocase)
-                        if (tcon->nocase)
+                seq_printf(s, ",nocase");
-                                seq_printf(s, ",nocase");
+        if (tcon->retry)
-                        if (tcon->retry)
+                seq_printf(s, ",hard");
-                                seq_printf(s, ",hard");
+        if (cifs_sb->prepath)
-                }
+                seq_printf(s, ",prepath=%s", cifs_sb->prepath);
-                if (cifs_sb->prepath)
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
-                        seq_printf(s, ",prepath=%s", cifs_sb->prepath);
+                seq_printf(s, ",posixpaths");
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
-                        seq_printf(s, ",posixpaths");
+                seq_printf(s, ",setuids");
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
-                        seq_printf(s, ",setuids");
+                seq_printf(s, ",serverino");
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
-                        seq_printf(s, ",serverino");
+                seq_printf(s, ",directio");
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
-                        seq_printf(s, ",directio");
+                seq_printf(s, ",nouser_xattr");
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
-                        seq_printf(s, ",nouser_xattr");
+                seq_printf(s, ",mapchars");
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
-                        seq_printf(s, ",mapchars");
+                seq_printf(s, ",sfu");
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
-                        seq_printf(s, ",sfu");
+                seq_printf(s, ",nobrl");
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
-                        seq_printf(s, ",nobrl");
+                seq_printf(s, ",cifsacl");
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
-                        seq_printf(s, ",cifsacl");
+                seq_printf(s, ",dynperm");
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
+        if (m->mnt_sb->s_flags & MS_POSIXACL)
-                        seq_printf(s, ",dynperm");
+                seq_printf(s, ",acl");
-                if (m->mnt_sb->s_flags & MS_POSIXACL)
-                        seq_printf(s, ",acl");
+        seq_printf(s, ",rsize=%d", cifs_sb->rsize);
+        seq_printf(s, ",wsize=%d", cifs_sb->wsize);
-                seq_printf(s, ",rsize=%d", cifs_sb->rsize);
-                seq_printf(s, ",wsize=%d", cifs_sb->wsize);
-        }
        return 0;
 }
@@ -535,9 +540,14 @@ static void cifs_umount_begin(struct super_block *sb)
        if (tcon == NULL)
                return;
-        lock_kernel();
        read_lock(&cifs_tcp_ses_lock);
-        if (tcon->tc_count == 1)
+        if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
+                /* we have other mounts to same share or we have
+                   already tried to force umount this and woken up
+                   all waiting network requests, nothing to do */
+                read_unlock(&cifs_tcp_ses_lock);
+                return;
+        } else if (tcon->tc_count == 1)
                tcon->tidStatus = CifsExiting;
        read_unlock(&cifs_tcp_ses_lock);
@@ -552,9 +562,7 @@ static void cifs_umount_begin(struct super_block *sb)
                wake_up_all(&tcon->ses->server->response_q);
                msleep(1);
        }
-/* BB FIXME - finish add checks for tidStatus BB */
-        unlock_kernel();
        return;
 }
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 9570a0e8023f..6c170948300d 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -24,6 +24,19 @@
 #define ROOT_I 2
+/*
+ * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down
+ * so that it will fit.
+ */
+static inline ino_t
+cifs_uniqueid_to_ino_t(u64 fileid)
+{
+        ino_t ino = (ino_t) fileid;
+        if (sizeof(ino_t) < sizeof(u64))
+                ino ^= fileid >> (sizeof(u64)-sizeof(ino_t)) * 8;
+        return ino;
+}
 extern struct file_system_type cifs_fs_type;
 extern const struct address_space_operations cifs_addr_ops;
 extern const struct address_space_operations cifs_addr_ops_smallbuf;
@@ -100,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
-#define CIFS_VERSION   "1.59"
+#define CIFS_VERSION   "1.60"
 #endif                          /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a61ab772c6f6..6084d6379c03 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -83,7 +83,7 @@ enum securityEnum {
        NTLM,                   /* Legacy NTLM012 auth with NTLM hash */
        NTLMv2,                 /* Legacy NTLM auth with NTLMv2 hash */
        RawNTLMSSP,             /* NTLMSSP without SPNEGO, NTLMv2 hash */
-        NTLMSSP,                /* NTLMSSP via SPNEGO, NTLMv2 hash */
+/*      NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
        Kerberos,               /* Kerberos via SPNEGO */
        MSKerberos,             /* MS Kerberos via SPNEGO */
 };
@@ -260,6 +260,8 @@ struct cifsTconInfo {
        atomic_t num_closes;
        atomic_t num_deletes;
        atomic_t num_mkdirs;
+        atomic_t num_posixopens;
+        atomic_t num_posixmkdirs;
        atomic_t num_rmdirs;
        atomic_t num_renames;
        atomic_t num_t2renames;
@@ -364,13 +366,13 @@ struct cifsInodeInfo {
        struct list_head openFileList;
        int write_behind_rc;
        __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
-        atomic_t inUse;  /* num concurrent users (local openers cifs) of file*/
        unsigned long time;     /* jiffies of last update/check of inode */
        bool clientCanCacheRead:1;      /* read oplock */
        bool clientCanCacheAll:1;       /* read and writebehind oplock */
        bool oplockPending:1;
        bool delete_pending:1;          /* DELETE_ON_CLOSE is set */
        u64  server_eof;                /* current file size on server */
+        u64  uniqueid;                  /* server inode number */
        struct inode vfs_inode;
 };
@@ -472,6 +474,32 @@ struct dfs_info3_param {
        char *node_name;
 };
+/*
+ * common struct for holding inode info when searching for or updating an
+ * inode with new info
+ */
+#define CIFS_FATTR_DFS_REFERRAL         0x1
+#define CIFS_FATTR_DELETE_PENDING       0x2
+#define CIFS_FATTR_NEED_REVAL           0x4
+struct cifs_fattr {
+        u32             cf_flags;
+        u32             cf_cifsattrs;
+        u64             cf_uniqueid;
+        u64             cf_eof;
+        u64             cf_bytes;
+        uid_t           cf_uid;
+        gid_t           cf_gid;
+        umode_t         cf_mode;
+        dev_t           cf_rdev;
+        unsigned int    cf_nlink;
+        unsigned int    cf_dtype;
+        struct timespec cf_atime;
+        struct timespec cf_mtime;
+        struct timespec cf_ctime;
+};
 static inline void free_dfs_info_param(struct dfs_info3_param *param)
 {
        if (param) {
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index a785f69dbc9f..2d07f890a842 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2328,19 +2328,7 @@ struct file_attrib_tag {
 typedef struct {
        __le32 NextEntryOffset;
        __u32 ResumeKey; /* as with FileIndex - no need to convert */
-        __le64 EndOfFile;
+        FILE_UNIX_BASIC_INFO basic;
-        __le64 NumOfBytes;
-        __le64 LastStatusChange; /*SNIA specs DCE time for the 3 time fields */
-        __le64 LastAccessTime;
-        __le64 LastModificationTime;
-        __le64 Uid;
-        __le64 Gid;
-        __le32 Type;
-        __le64 DevMajor;
-        __le64 DevMinor;
-        __le64 UniqueId;
-        __le64 Permissions;
-        __le64 Nlinks;
        char FileName[1];
 } __attribute__((packed)) FILE_UNIX_INFO; /* level 0x202 */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index f9452329bcce..da8fbf565991 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -74,7 +74,7 @@ extern unsigned int smbCalcSize(struct smb_hdr *ptr);
 extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
 extern int decode_negTokenInit(unsigned char *security_blob, int length,
                        enum securityEnum *secType);
-extern int cifs_inet_pton(const int, const char *source, void *dst);
+extern int cifs_convert_address(char *src, void *dst);
 extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
 extern void header_assemble(struct smb_hdr *, char /* command */ ,
                            const struct cifsTconInfo *, int /* length of
@@ -98,9 +98,13 @@ extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
                           struct super_block *sb, int mode, int oflags,
                           int *poplock, __u16 *pnetfid, int xid);
-extern void posix_fill_in_inode(struct inode *tmp_inode,
+extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
-                                FILE_UNIX_BASIC_INFO *pData, int isNewInode);
+                                     FILE_UNIX_BASIC_INFO *info,
-extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum);
+                                     struct cifs_sb_info *cifs_sb);
+extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
+extern struct inode *cifs_iget(struct super_block *sb,
+                               struct cifs_fattr *fattr);
 extern int cifs_get_inode_info(struct inode **pinode,
                        const unsigned char *search_path,
                        FILE_ALL_INFO *pfile_info,
@@ -108,8 +112,9 @@ extern int cifs_get_inode_info(struct inode **pinode,
 extern int cifs_get_inode_info_unix(struct inode **pinode,
                        const unsigned char *search_path,
                        struct super_block *sb, int xid);
-extern void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode,
+extern void cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
-                            const char *path, const __u16 *pfid);
+                              struct cifs_fattr *fattr, struct inode *inode,
+                              const char *path, const __u16 *pfid);
 extern int mode_to_acl(struct inode *inode, const char *path, __u64);
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
@@ -215,7 +220,11 @@ struct cifs_unix_set_info_args {
        dev_t   device;
 };
-extern int CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *pTcon,
+extern int CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
+                                  const struct cifs_unix_set_info_args *args,
+                                  u16 fid, u32 pid_of_opener);
+extern int CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *pTcon,
                        char *fileName,
                        const struct cifs_unix_set_info_args *args,
                        const struct nls_table *nls_codepage,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index b84c61d5bca4..1866bc2927d4 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -594,7 +594,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
        else if (secFlags & CIFSSEC_MAY_KRB5)
                server->secType = Kerberos;
        else if (secFlags & CIFSSEC_MAY_NTLMSSP)
-                server->secType = NTLMSSP;
+                server->secType = RawNTLMSSP;
        else if (secFlags & CIFSSEC_MAY_LANMAN)
                server->secType = LANMAN;
 /* #ifdef CONFIG_CIFS_EXPERIMENTAL
@@ -729,7 +729,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
         * the tcon is no longer on the list, so no need to take lock before
         * checking this.
         */
-        if (tcon->need_reconnect)
+        if ((tcon->need_reconnect) || (tcon->ses->need_reconnect))
                return 0;
        rc = small_smb_init(SMB_COM_TREE_DISCONNECT, 0, tcon,
@@ -1113,7 +1113,10 @@ PsxCreat:
 psx_create_err:
        cifs_buf_release(pSMB);
-        cifs_stats_inc(&tcon->num_mkdirs);
+        if (posix_flags & SMB_O_DIRECTORY)
+                cifs_stats_inc(&tcon->num_posixmkdirs);
+        else
+                cifs_stats_inc(&tcon->num_posixopens);
        if (rc == -EAGAIN)
                goto PsxCreat;
@@ -5074,10 +5077,114 @@ SetAttrLgcyRetry:
 }
 #endif /* temporarily unneeded SetAttr legacy function */
+static void
+cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
+                        const struct cifs_unix_set_info_args *args)
+{
+        u64 mode = args->mode;
+        /*
+         * Samba server ignores set of file size to zero due to bugs in some
+         * older clients, but we should be precise - we use SetFileSize to
+         * set file size and do not want to truncate file size to zero
+         * accidentally as happened on one Samba server beta by putting
+         * zero instead of -1 here
+         */
+        data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64);
+        data_offset->NumOfBytes = cpu_to_le64(NO_CHANGE_64);
+        data_offset->LastStatusChange = cpu_to_le64(args->ctime);
+        data_offset->LastAccessTime = cpu_to_le64(args->atime);
+        data_offset->LastModificationTime = cpu_to_le64(args->mtime);
+        data_offset->Uid = cpu_to_le64(args->uid);
+        data_offset->Gid = cpu_to_le64(args->gid);
+        /* better to leave device as zero when it is  */
+        data_offset->DevMajor = cpu_to_le64(MAJOR(args->device));
+        data_offset->DevMinor = cpu_to_le64(MINOR(args->device));
+        data_offset->Permissions = cpu_to_le64(mode);
+        if (S_ISREG(mode))
+                data_offset->Type = cpu_to_le32(UNIX_FILE);
+        else if (S_ISDIR(mode))
+                data_offset->Type = cpu_to_le32(UNIX_DIR);
+        else if (S_ISLNK(mode))
+                data_offset->Type = cpu_to_le32(UNIX_SYMLINK);
+        else if (S_ISCHR(mode))
+                data_offset->Type = cpu_to_le32(UNIX_CHARDEV);
+        else if (S_ISBLK(mode))
+                data_offset->Type = cpu_to_le32(UNIX_BLOCKDEV);
+        else if (S_ISFIFO(mode))
+                data_offset->Type = cpu_to_le32(UNIX_FIFO);
+        else if (S_ISSOCK(mode))
+                data_offset->Type = cpu_to_le32(UNIX_SOCKET);
+}
 int
-CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
+CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
-                   const struct cifs_unix_set_info_args *args,
+                       const struct cifs_unix_set_info_args *args,
-                   const struct nls_table *nls_codepage, int remap)
+                       u16 fid, u32 pid_of_opener)
+{
+        struct smb_com_transaction2_sfi_req *pSMB  = NULL;
+        FILE_UNIX_BASIC_INFO *data_offset;
+        int rc = 0;
+        u16 params, param_offset, offset, byte_count, count;
+        cFYI(1, ("Set Unix Info (via SetFileInfo)"));
+        rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
+        if (rc)
+                return rc;
+        pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
+        pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
+        params = 6;
+        pSMB->MaxSetupCount = 0;
+        pSMB->Reserved = 0;
+        pSMB->Flags = 0;
+        pSMB->Timeout = 0;
+        pSMB->Reserved2 = 0;
+        param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
+        offset = param_offset + params;
+        data_offset = (FILE_UNIX_BASIC_INFO *)
+                                ((char *)(&pSMB->hdr.Protocol) + offset);
+        count = sizeof(FILE_UNIX_BASIC_INFO);
+        pSMB->MaxParameterCount = cpu_to_le16(2);
+        /* BB find max SMB PDU from sess */
+        pSMB->MaxDataCount = cpu_to_le16(1000);
+        pSMB->SetupCount = 1;
+        pSMB->Reserved3 = 0;
+        pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
+        byte_count = 3 /* pad */  + params + count;
+        pSMB->DataCount = cpu_to_le16(count);
+        pSMB->ParameterCount = cpu_to_le16(params);
+        pSMB->TotalDataCount = pSMB->DataCount;
+        pSMB->TotalParameterCount = pSMB->ParameterCount;
+        pSMB->ParameterOffset = cpu_to_le16(param_offset);
+        pSMB->DataOffset = cpu_to_le16(offset);
+        pSMB->Fid = fid;
+        pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC);
+        pSMB->Reserved4 = 0;
+        pSMB->hdr.smb_buf_length += byte_count;
+        pSMB->ByteCount = cpu_to_le16(byte_count);
+        cifs_fill_unix_set_info(data_offset, args);
+        rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+        if (rc)
+                cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc));
+        /* Note: On -EAGAIN error only caller can retry on handle based calls
+                since the file handle passed in is no longer valid */
+        return rc;
+}
+int
+CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
+                       const struct cifs_unix_set_info_args *args,
+                       const struct nls_table *nls_codepage, int remap)
 {
        TRANSACTION2_SPI_REQ *pSMB = NULL;
        TRANSACTION2_SPI_RSP *pSMBr = NULL;
@@ -5086,7 +5193,6 @@ CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
        int bytes_returned = 0;
        FILE_UNIX_BASIC_INFO *data_offset;
        __u16 params, param_offset, offset, count, byte_count;
-        __u64 mode = args->mode;
        cFYI(1, ("In SetUID/GID/Mode"));
 setPermsRetry:
@@ -5137,38 +5243,8 @@ setPermsRetry:
        pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC);
        pSMB->Reserved4 = 0;
        pSMB->hdr.smb_buf_length += byte_count;
-        /* Samba server ignores set of file size to zero due to bugs in some
-        older clients, but we should be precise - we use SetFileSize to
-        set file size and do not want to truncate file size to zero
-        accidently as happened on one Samba server beta by putting
-        zero instead of -1 here */
-        data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64);
-        data_offset->NumOfBytes = cpu_to_le64(NO_CHANGE_64);
-        data_offset->LastStatusChange = cpu_to_le64(args->ctime);
-        data_offset->LastAccessTime = cpu_to_le64(args->atime);
-        data_offset->LastModificationTime = cpu_to_le64(args->mtime);
-        data_offset->Uid = cpu_to_le64(args->uid);
-        data_offset->Gid = cpu_to_le64(args->gid);
-        /* better to leave device as zero when it is  */
-        data_offset->DevMajor = cpu_to_le64(MAJOR(args->device));
-        data_offset->DevMinor = cpu_to_le64(MINOR(args->device));
-        data_offset->Permissions = cpu_to_le64(mode);
-        if (S_ISREG(mode))
-                data_offset->Type = cpu_to_le32(UNIX_FILE);
-        else if (S_ISDIR(mode))
-                data_offset->Type = cpu_to_le32(UNIX_DIR);
-        else if (S_ISLNK(mode))
-                data_offset->Type = cpu_to_le32(UNIX_SYMLINK);
-        else if (S_ISCHR(mode))
-                data_offset->Type = cpu_to_le32(UNIX_CHARDEV);
-        else if (S_ISBLK(mode))
-                data_offset->Type = cpu_to_le32(UNIX_BLOCKDEV);
-        else if (S_ISFIFO(mode))
-                data_offset->Type = cpu_to_le32(UNIX_FIFO);
-        else if (S_ISSOCK(mode))
-                data_offset->Type = cpu_to_le32(UNIX_SOCKET);
+        cifs_fill_unix_set_info(data_offset, args);
        pSMB->ByteCount = cpu_to_le16(byte_count);
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 97f4311b9a8e..1f3345d7fa79 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -70,7 +70,6 @@ struct smb_vol {
        mode_t file_mode;
        mode_t dir_mode;
        unsigned secFlg;
-        bool rw:1;
        bool retry:1;
        bool intr:1;
        bool setuids:1;
@@ -804,6 +803,10 @@ cifs_parse_mount_options(char *options, const char *devname,
        char *data;
        unsigned int  temp_len, i, j;
        char separator[2];
+        short int override_uid = -1;
+        short int override_gid = -1;
+        bool uid_specified = false;
+        bool gid_specified = false;
        separator[0] = ',';
        separator[1] = 0;
@@ -832,7 +835,6 @@ cifs_parse_mount_options(char *options, const char *devname,
        vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR;
        /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */
-        vol->rw = true;
        /* default is always to request posix paths. */
        vol->posix_paths = 1;
        /* default to using server inode numbers where available */
@@ -1095,18 +1097,20 @@ cifs_parse_mount_options(char *options, const char *devname,
                                                    "too long.\n");
                                return 1;
                        }
-                } else if (strnicmp(data, "uid", 3) == 0) {
+                } else if (!strnicmp(data, "uid", 3) && value && *value) {
-                        if (value && *value)
+                        vol->linux_uid = simple_strtoul(value, &value, 0);
-                                vol->linux_uid =
+                        uid_specified = true;
-                                        simple_strtoul(value, &value, 0);
+                } else if (!strnicmp(data, "forceuid", 8)) {
-                } else if (strnicmp(data, "forceuid", 8) == 0) {
+                        override_uid = 1;
-                                vol->override_uid = 1;
+                } else if (!strnicmp(data, "noforceuid", 10)) {
-                } else if (strnicmp(data, "gid", 3) == 0) {
+                        override_uid = 0;
-                        if (value && *value)
+                } else if (!strnicmp(data, "gid", 3) && value && *value) {
-                                vol->linux_gid =
+                        vol->linux_gid = simple_strtoul(value, &value, 0);
-                                        simple_strtoul(value, &value, 0);
+                        gid_specified = true;
-                } else if (strnicmp(data, "forcegid", 8) == 0) {
+                } else if (!strnicmp(data, "forcegid", 8)) {
-                                vol->override_gid = 1;
+                        override_gid = 1;
+                } else if (!strnicmp(data, "noforcegid", 10)) {
+                        override_gid = 0;
                } else if (strnicmp(data, "file_mode", 4) == 0) {
                        if (value && *value) {
                                vol->file_mode =
@@ -1199,7 +1203,9 @@ cifs_parse_mount_options(char *options, const char *devname,
                } else if (strnicmp(data, "guest", 5) == 0) {
                        /* ignore */
                } else if (strnicmp(data, "rw", 2) == 0) {
-                        vol->rw = true;
+                        /* ignore */
+                } else if (strnicmp(data, "ro", 2) == 0) {
+                        /* ignore */
                } else if (strnicmp(data, "noblocksend", 11) == 0) {
                        vol->noblocksnd = 1;
                } else if (strnicmp(data, "noautotune", 10) == 0) {
@@ -1218,8 +1224,6 @@ cifs_parse_mount_options(char *options, const char *devname,
                            parse these options again and set anything and it
                            is ok to just ignore them */
                        continue;
-                } else if (strnicmp(data, "ro", 2) == 0) {
-                        vol->rw = false;
                } else if (strnicmp(data, "hard", 4) == 0) {
                        vol->retry = 1;
                } else if (strnicmp(data, "soft", 4) == 0) {
@@ -1357,6 +1361,18 @@ cifs_parse_mount_options(char *options, const char *devname,
        if (vol->UNCip == NULL)
                vol->UNCip = &vol->UNC[2];
+        if (uid_specified)
+                vol->override_uid = override_uid;
+        else if (override_uid == 1)
+                printk(KERN_NOTICE "CIFS: ignoring forceuid mount option "
+                                   "specified with no uid= option.\n");
+        if (gid_specified)
+                vol->override_gid = override_gid;
+        else if (override_gid == 1)
+                printk(KERN_NOTICE "CIFS: ignoring forcegid mount option "
+                                   "specified with no gid= option.\n");
        return 0;
 }
@@ -1386,8 +1402,10 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
                     server->addr.sockAddr.sin_addr.s_addr))
                        continue;
                else if (addr->ss_family == AF_INET6 &&
-                         !ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
+                         (!ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
-                                          &addr6->sin6_addr))
+                                           &addr6->sin6_addr) ||
+                          server->addr.sockAddr6.sin6_scope_id !=
+                                           addr6->sin6_scope_id))
                        continue;
                ++server->srv_count;
@@ -1433,28 +1451,15 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
        memset(&addr, 0, sizeof(struct sockaddr_storage));
-        if (volume_info->UNCip && volume_info->UNC) {
+        cFYI(1, ("UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip));
-                rc = cifs_inet_pton(AF_INET, volume_info->UNCip,
-                                    &sin_server->sin_addr.s_addr);
-                if (rc <= 0) {
-                        /* not ipv4 address, try ipv6 */
-                        rc = cifs_inet_pton(AF_INET6, volume_info->UNCip,
-                                            &sin_server6->sin6_addr.in6_u);
-                        if (rc > 0)
-                                addr.ss_family = AF_INET6;
-                } else {
-                        addr.ss_family = AF_INET;
-                }
-                if (rc <= 0) {
+        if (volume_info->UNCip && volume_info->UNC) {
+                rc = cifs_convert_address(volume_info->UNCip, &addr);
+                if (!rc) {
                        /* we failed translating address */
                        rc = -EINVAL;
                        goto out_err;
                }
-                cFYI(1, ("UNC: %s ip: %s", volume_info->UNC,
-                         volume_info->UNCip));
        } else if (volume_info->UNCip) {
                /* BB using ip addr as tcp_ses name to connect to the
                   DFS root below */
@@ -1513,14 +1518,14 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
                cFYI(1, ("attempting ipv6 connect"));
                /* BB should we allow ipv6 on port 139? */
                /* other OS never observed in Wild doing 139 with v6 */
+                sin_server6->sin6_port = htons(volume_info->port);
                memcpy(&tcp_ses->addr.sockAddr6, sin_server6,
                        sizeof(struct sockaddr_in6));
-                sin_server6->sin6_port = htons(volume_info->port);
                rc = ipv6_connect(tcp_ses);
        } else {
+                sin_server->sin_port = htons(volume_info->port);
                memcpy(&tcp_ses->addr.sockAddr, sin_server,
                        sizeof(struct sockaddr_in));
-                sin_server->sin_port = htons(volume_info->port);
                rc = ipv4_connect(tcp_ses);
        }
        if (rc < 0) {
@@ -2465,10 +2470,10 @@ try_mount_again:
                tcon->local_lease = volume_info->local_lease;
        }
        if (pSesInfo) {
-                if (pSesInfo->capabilities & CAP_LARGE_FILES) {
+                if (pSesInfo->capabilities & CAP_LARGE_FILES)
-                        sb->s_maxbytes = (u64) 1 << 63;
+                        sb->s_maxbytes = MAX_LFS_FILESIZE;
-                } else
+                else
-                        sb->s_maxbytes = (u64) 1 << 31; /* 2 GB */
+                        sb->s_maxbytes = MAX_NON_LFS;
        }
        /* BB FIXME fix time_gran to be larger for LANMAN sessions */
@@ -2557,11 +2562,20 @@ remote_path_check:
                        if (mount_data != mount_data_global)
                                kfree(mount_data);
                        mount_data = cifs_compose_mount_options(
                                        cifs_sb->mountdata, full_path + 1,
                                        referrals, &fake_devname);
-                        kfree(fake_devname);
                        free_dfs_info_array(referrals, num_referrals);
+                        kfree(fake_devname);
+                        kfree(full_path);
+                        if (IS_ERR(mount_data)) {
+                                rc = PTR_ERR(mount_data);
+                                mount_data = NULL;
+                                goto mount_fail_check;
+                        }
                        if (tcon)
                                cifs_put_tcon(tcon);
@@ -2569,8 +2583,6 @@ remote_path_check:
                                cifs_put_smb_ses(pSesInfo);
                        cleanup_volume_info(&volume_info);
-                        FreeXid(xid);
-                        kfree(full_path);
                        referral_walks_count++;
                        goto try_mount_again;
                }
@@ -2739,6 +2751,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
                strncpy(tcon->treeName, tree, MAX_TREE_SIZE);
                /* mostly informational -- no need to fail on error here */
+                kfree(tcon->nativeFileSystem);
                tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr,
                                                      bytes_left, is_unicode,
                                                      nls_codepage);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3758965d73d5..4326ffd90fa9 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -188,6 +188,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
        FILE_UNIX_BASIC_INFO *presp_data;
        __u32 posix_flags = 0;
        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+        struct cifs_fattr fattr;
        cFYI(1, ("posix open %s", full_path));
@@ -236,22 +237,21 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
        if (presp_data->Type == cpu_to_le32(-1))
                goto posix_open_ret; /* open ok, caller does qpathinfo */
-        /* get new inode and set it up */
        if (!pinode)
                goto posix_open_ret; /* caller does not need info */
+        cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
+        /* get new inode and set it up */
        if (*pinode == NULL) {
-                __u64 unique_id = le64_to_cpu(presp_data->UniqueId);
+                *pinode = cifs_iget(sb, &fattr);
-                *pinode = cifs_new_inode(sb, &unique_id);
+                if (!*pinode) {
+                        rc = -ENOMEM;
+                        goto posix_open_ret;
+                }
+        } else {
+                cifs_fattr_to_inode(*pinode, &fattr);
        }
-        /* else an inode was passed in. Update its info, don't create one */
-        /* We do not need to close the file if new_inode fails since
-           the caller will retry qpathinfo as long as inode is null */
-        if (*pinode == NULL)
-                goto posix_open_ret;
-        posix_fill_in_inode(*pinode, presp_data, 1);
        cifs_fill_fileinfo(*pinode, *pnetfid, cifs_sb->tcon, write_only);
@@ -307,8 +307,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
+                rc = -ENOMEM;
                FreeXid(xid);
-                return -ENOMEM;
+                return rc;
        }
        if (oplockEnabled)
@@ -424,9 +425,10 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
                        args.uid = NO_CHANGE_64;
                        args.gid = NO_CHANGE_64;
                }
-                CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
+                CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
-                        cifs_sb->local_nls,
+                                        cifs_sb->local_nls,
-                        cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+                                        cifs_sb->mnt_cifs_flags &
+                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
        } else {
                /* BB implement mode setting via Windows security
                   descriptors e.g. */
@@ -514,10 +516,10 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
                        args.uid = NO_CHANGE_64;
                        args.gid = NO_CHANGE_64;
                }
-                rc = CIFSSMBUnixSetInfo(xid, pTcon, full_path,
+                rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args,
-                        &args, cifs_sb->local_nls,
+                                            cifs_sb->local_nls,
-                        cifs_sb->mnt_cifs_flags &
+                                            cifs_sb->mnt_cifs_flags &
-                                CIFS_MOUNT_MAP_SPECIAL_CHR);
+                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
                if (!rc) {
                        rc = cifs_get_inode_info_unix(&newinode, full_path,
@@ -540,8 +542,9 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
                        buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
                        if (buf == NULL) {
                                kfree(full_path);
+                                rc = -ENOMEM;
                                FreeXid(xid);
-                                return -ENOMEM;
+                                return rc;
                        }
                        rc = CIFSSMBOpen(xid, pTcon, full_path,
@@ -641,6 +644,15 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
                        }
        }
+        /*
+         * O_EXCL: optimize away the lookup, but don't hash the dentry. Let
+         * the VFS handle the create. nd is checked first as it may be NULL.
+         */
+        if (nd && (nd->flags & LOOKUP_EXCL)) {
+                d_instantiate(direntry, NULL);
+                return NULL;
+        }
        /* can not grab the rename sem here since it would
        deadlock in the cases (beginning of sys_rename itself)
        in which we already have the sb rename sem */
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index df4a306f697e..87948147d7ec 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -35,26 +35,11 @@
 *              0 - name is not IP
 */
 static int
-is_ip(const char *name)
+is_ip(char *name)
 {
-        int rc;
+        struct sockaddr_storage ss;
-        struct sockaddr_in sin_server;
-        struct sockaddr_in6 sin_server6;
+        return cifs_convert_address(name, &ss);
-        rc = cifs_inet_pton(AF_INET, name,
-                        &sin_server.sin_addr.s_addr);
-        if (rc <= 0) {
-                /* not ipv4 address, try ipv6 */
-                rc = cifs_inet_pton(AF_INET6, name,
-                                &sin_server6.sin6_addr.in6_u);
-                if (rc > 0)
-                        return 1;
-        } else {
-                return 1;
-        }
-        /* we failed translating address */
-        return 0;
 }
 static int
@@ -72,7 +57,7 @@ dns_resolver_instantiate(struct key *key, const void *data,
        ip[datalen] = '\0';
        /* make sure this looks like an address */
-        if (!is_ip((const char *) ip)) {
+        if (!is_ip(ip)) {
                kfree(ip);
                return -EINVAL;
        }
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 06866841b97f..c34b7f8a217b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -300,14 +300,16 @@ int cifs_open(struct inode *inode, struct file *file)
        pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
        pCifsFile = cifs_fill_filedata(file);
        if (pCifsFile) {
+                rc = 0;
                FreeXid(xid);
-                return 0;
+                return rc;
        }
        full_path = build_path_from_dentry(file->f_path.dentry);
        if (full_path == NULL) {
+                rc = -ENOMEM;
                FreeXid(xid);
-                return -ENOMEM;
+                return rc;
        }
        cFYI(1, ("inode = 0x%p file flags are 0x%x for %s",
@@ -446,9 +448,9 @@ int cifs_open(struct inode *inode, struct file *file)
                                .mtime  = NO_CHANGE_64,
                                .device = 0,
                        };
-                        CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
+                        CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
-                                            cifs_sb->local_nls,
+                                               cifs_sb->local_nls,
-                                            cifs_sb->mnt_cifs_flags &
+                                               cifs_sb->mnt_cifs_flags &
                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
                }
        }
@@ -491,11 +493,12 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
                return -EBADF;
        xid = GetXid();
-        mutex_unlock(&pCifsFile->fh_mutex);
+        mutex_lock(&pCifsFile->fh_mutex);
        if (!pCifsFile->invalidHandle) {
-                mutex_lock(&pCifsFile->fh_mutex);
+                mutex_unlock(&pCifsFile->fh_mutex);
+                rc = 0;
                FreeXid(xid);
-                return 0;
+                return rc;
        }
        if (file->f_path.dentry == NULL) {
@@ -524,7 +527,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
        if (full_path == NULL) {
                rc = -ENOMEM;
 reopen_error_exit:
-                mutex_lock(&pCifsFile->fh_mutex);
+                mutex_unlock(&pCifsFile->fh_mutex);
                FreeXid(xid);
                return rc;
        }
@@ -566,14 +569,14 @@ reopen_error_exit:
                         cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
                                CIFS_MOUNT_MAP_SPECIAL_CHR);
        if (rc) {
-                mutex_lock(&pCifsFile->fh_mutex);
+                mutex_unlock(&pCifsFile->fh_mutex);
                cFYI(1, ("cifs_open returned 0x%x", rc));
                cFYI(1, ("oplock: %d", oplock));
        } else {
 reopen_success:
                pCifsFile->netfid = netfid;
                pCifsFile->invalidHandle = false;
-                mutex_lock(&pCifsFile->fh_mutex);
+                mutex_unlock(&pCifsFile->fh_mutex);
                pCifsInode = CIFS_I(inode);
                if (pCifsInode) {
                        if (can_flush) {
@@ -845,8 +848,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
        tcon = cifs_sb->tcon;
        if (file->private_data == NULL) {
+                rc = -EBADF;
                FreeXid(xid);
-                return -EBADF;
+                return rc;
        }
        netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
@@ -1805,8 +1809,9 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
        pTcon = cifs_sb->tcon;
        if (file->private_data == NULL) {
+                rc = -EBADF;
                FreeXid(xid);
-                return -EBADF;
+                return rc;
        }
        open_file = (struct cifsFileInfo *)file->private_data;
@@ -1885,8 +1890,9 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
        pTcon = cifs_sb->tcon;
        if (file->private_data == NULL) {
+                rc = -EBADF;
                FreeXid(xid);
-                return -EBADF;
+                return rc;
        }
        open_file = (struct cifsFileInfo *)file->private_data;
@@ -2019,8 +2025,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
        xid = GetXid();
        if (file->private_data == NULL) {
+                rc = -EBADF;
                FreeXid(xid);
-                return -EBADF;
+                return rc;
        }
        open_file = (struct cifsFileInfo *)file->private_data;
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
@@ -2185,8 +2192,9 @@ static int cifs_readpage(struct file *file, struct page *page)
        xid = GetXid();
        if (file->private_data == NULL) {
+                rc = -EBADF;
                FreeXid(xid);
-                return -EBADF;
+                return rc;
        }
        cFYI(1, ("readpage %p at offset %d 0x%x\n",
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index fad882b075ba..82d83839655e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -77,239 +77,202 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
        }
 }
-static void cifs_unix_info_to_inode(struct inode *inode,
+/*
+ * Populate an inode with info from a cifs_fattr struct: timestamps, rdev,
+ * nlink, uid/gid, mode (kept as-is for existing inodes when dynperm is set),
+ * CIFS attributes, unique id, revalidation time, delete-pending state and
+ * the server-reported EOF. i_size and i_blocks are only updated when
+ * is_size_safe_to_change() allows it. Finishes by selecting the inode ops.
+ */
-                FILE_UNIX_BASIC_INFO *info, int force_uid_gid)
+void
+cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 {
+        struct cifsInodeInfo *cifs_i = CIFS_I(inode);
         struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-        struct cifsInodeInfo *cifsInfo = CIFS_I(inode);
+        unsigned long oldtime = cifs_i->time;
-        __u64 num_of_bytes = le64_to_cpu(info->NumOfBytes);
-        __u64 end_of_file = le64_to_cpu(info->EndOfFile);
+        inode->i_atime = fattr->cf_atime;
+        inode->i_mtime = fattr->cf_mtime;
+        inode->i_ctime = fattr->cf_ctime;
+        inode->i_rdev = fattr->cf_rdev;
+        inode->i_nlink = fattr->cf_nlink;
+        inode->i_uid = fattr->cf_uid;
+        inode->i_gid = fattr->cf_gid;
+        /* if dynperm is set, don't clobber existing mode */
+        if (inode->i_state & I_NEW ||
+            !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM))
+                inode->i_mode = fattr->cf_mode;
+        cifs_i->cifsAttrs = fattr->cf_cifsattrs;
+        cifs_i->uniqueid = fattr->cf_uniqueid;
+        if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
+                cifs_i->time = 0;
+        else
+                cifs_i->time = jiffies;
+        cFYI(1, ("inode 0x%p old_time=%ld new_time=%ld", inode,
+                 oldtime, cifs_i->time));
-        inode->i_atime = cifs_NTtimeToUnix(info->LastAccessTime);
+        cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
+        /* always record the EOF reported by the server, even if i_size is kept */
+        cifs_i->server_eof = fattr->cf_eof;
-        inode->i_mtime =
-                cifs_NTtimeToUnix(info->LastModificationTime);
+        /*
-        inode->i_ctime = cifs_NTtimeToUnix(info->LastStatusChange);
+         * Can't safely change the file size here if the client is writing to
-        inode->i_mode = le64_to_cpu(info->Permissions);
+         * it due to potential races.
+         */
+        spin_lock(&inode->i_lock);
+        if (is_size_safe_to_change(cifs_i, fattr->cf_eof)) {
+                i_size_write(inode, fattr->cf_eof);
+                /*
+                 * i_blocks is not related to (i_size / i_blksize),
+                 * but instead 512 byte (2**9) size is required for
+                 * calculating num blocks.
+                 */
+                inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9;
+        }
+        spin_unlock(&inode->i_lock);
+        cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
+}
+/* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */
+void
+cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
+                         struct cifs_sb_info *cifs_sb)
+{
+        memset(fattr, 0, sizeof(*fattr));
+        fattr->cf_uniqueid = le64_to_cpu(info->UniqueId);
+        fattr->cf_bytes = le64_to_cpu(info->NumOfBytes);
+        fattr->cf_eof = le64_to_cpu(info->EndOfFile);
+        fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
+        fattr->cf_mtime = cifs_NTtimeToUnix(info->LastModificationTime);
+        fattr->cf_ctime = cifs_NTtimeToUnix(info->LastStatusChange);
+        fattr->cf_mode = le64_to_cpu(info->Permissions);
        /*
         * Since we set the inode type below we need to mask off
         * to avoid strange results if bits set above.
         */
-        inode->i_mode &= ~S_IFMT;
+        fattr->cf_mode &= ~S_IFMT;
        switch (le32_to_cpu(info->Type)) {
        case UNIX_FILE:
-                inode->i_mode |= S_IFREG;
+                fattr->cf_mode |= S_IFREG;
+                fattr->cf_dtype = DT_REG;
                break;
        case UNIX_SYMLINK:
-                inode->i_mode |= S_IFLNK;
+                fattr->cf_mode |= S_IFLNK;
+                fattr->cf_dtype = DT_LNK;
                break;
        case UNIX_DIR:
-                inode->i_mode |= S_IFDIR;
+                fattr->cf_mode |= S_IFDIR;
+                fattr->cf_dtype = DT_DIR;
                break;
        case UNIX_CHARDEV:
-                inode->i_mode |= S_IFCHR;
+                fattr->cf_mode |= S_IFCHR;
-                inode->i_rdev = MKDEV(le64_to_cpu(info->DevMajor),
+                fattr->cf_dtype = DT_CHR;
-                                      le64_to_cpu(info->DevMinor) & MINORMASK);
+                fattr->cf_rdev = MKDEV(le64_to_cpu(info->DevMajor),
+                                       le64_to_cpu(info->DevMinor) & MINORMASK);
                break;
        case UNIX_BLOCKDEV:
-                inode->i_mode |= S_IFBLK;
+                fattr->cf_mode |= S_IFBLK;
-                inode->i_rdev = MKDEV(le64_to_cpu(info->DevMajor),
+                fattr->cf_dtype = DT_BLK;
-                                      le64_to_cpu(info->DevMinor) & MINORMASK);
+                fattr->cf_rdev = MKDEV(le64_to_cpu(info->DevMajor),
+                                       le64_to_cpu(info->DevMinor) & MINORMASK);
                break;
        case UNIX_FIFO:
-                inode->i_mode |= S_IFIFO;
+                fattr->cf_mode |= S_IFIFO;
+                fattr->cf_dtype = DT_FIFO;
                break;
        case UNIX_SOCKET:
-                inode->i_mode |= S_IFSOCK;
+                fattr->cf_mode |= S_IFSOCK;
+                fattr->cf_dtype = DT_SOCK;
                break;
        default:
                /* safest to call it a file if we do not know */
-                inode->i_mode |= S_IFREG;
+                fattr->cf_mode |= S_IFREG;
+                fattr->cf_dtype = DT_REG;
                cFYI(1, ("unknown type %d", le32_to_cpu(info->Type)));
                break;
        }
-        if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) &&
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
-            !force_uid_gid)
+                fattr->cf_uid = cifs_sb->mnt_uid;
-                inode->i_uid = cifs_sb->mnt_uid;
        else
-                inode->i_uid = le64_to_cpu(info->Uid);
+                fattr->cf_uid = le64_to_cpu(info->Uid);
-        if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) &&
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
-            !force_uid_gid)
+                fattr->cf_gid = cifs_sb->mnt_gid;
-                inode->i_gid = cifs_sb->mnt_gid;
        else
-                inode->i_gid = le64_to_cpu(info->Gid);
+                fattr->cf_gid = le64_to_cpu(info->Gid);
-        inode->i_nlink = le64_to_cpu(info->Nlinks);
-        cifsInfo->server_eof = end_of_file;
-        spin_lock(&inode->i_lock);
-        if (is_size_safe_to_change(cifsInfo, end_of_file)) {
-                /*
-                 * We can not safely change the file size here if the client
-                 * is writing to it due to potential races.
-                 */
-                i_size_write(inode, end_of_file);
-                /*
+        fattr->cf_nlink = le64_to_cpu(info->Nlinks);
-                 * i_blocks is not related to (i_size / i_blksize),
-                 * but instead 512 byte (2**9) size is required for
-                 * calculating num blocks.
-                 */
-                inode->i_blocks = (512 - 1 + num_of_bytes) >> 9;
-        }
-        spin_unlock(&inode->i_lock);
 }
 /*
- *      Needed to setup inode data for the directory which is the
+ * Fill a cifs_fattr struct with fake inode info.
- *      junction to the new submount (ie to setup the fake directory
- *      which represents a DFS referral)
- */
-static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat,
-                               struct super_block *sb)
-{
-        struct inode *pinode = NULL;
-        memset(pfnd_dat, 0, sizeof(FILE_UNIX_BASIC_INFO));
-/*      __le64 pfnd_dat->EndOfFile = cpu_to_le64(0);
-        __le64 pfnd_dat->NumOfBytes = cpu_to_le64(0);
-        __u64 UniqueId = 0;  */
-        pfnd_dat->LastStatusChange =
-                cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
-        pfnd_dat->LastAccessTime =
-                cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
-        pfnd_dat->LastModificationTime =
-                cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
-        pfnd_dat->Type = cpu_to_le32(UNIX_DIR);
-        pfnd_dat->Permissions = cpu_to_le64(S_IXUGO | S_IRWXU);
-        pfnd_dat->Nlinks = cpu_to_le64(2);
-        if (sb->s_root)
-                pinode = sb->s_root->d_inode;
-        if (pinode == NULL)
-                return;
-        /* fill in default values for the remaining based on root
-           inode since we can not query the server for this inode info */
-        pfnd_dat->DevMajor = cpu_to_le64(MAJOR(pinode->i_rdev));
-        pfnd_dat->DevMinor = cpu_to_le64(MINOR(pinode->i_rdev));
-        pfnd_dat->Uid = cpu_to_le64(pinode->i_uid);
-        pfnd_dat->Gid = cpu_to_le64(pinode->i_gid);
-}
-/**
- * cifs_new inode - create new inode, initialize, and hash it
- * @sb - pointer to superblock
- * @inum - if valid pointer and serverino is enabled, replace i_ino with val
- *
- * Create a new inode, initialize it for CIFS and hash it. Returns the new
- * inode or NULL if one couldn't be allocated.
 *
- * If the share isn't mounted with "serverino" or inum is a NULL pointer then
+ * Needed to set up cifs_fattr data for the directory which is the
- * we'll just use the inode number assigned by new_inode(). Note that this can
+ * junction to the new submount (i.e. to set up the fake directory
- * mean i_ino collisions since the i_ino assigned by new_inode is not
+ * which represents a DFS referral).
- * guaranteed to be unique.
 */
-struct inode *
+static void
-cifs_new_inode(struct super_block *sb, __u64 *inum)
+cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
 {
-        struct inode *inode;
+        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-        inode = new_inode(sb);
-        if (inode == NULL)
-                return NULL;
-        /*
-         * BB: Is i_ino == 0 legal? Here, we assume that it is. If it isn't we
-         *     stop passing inum as ptr. Are there sanity checks we can use to
-         *     ensure that the server is really filling in that field? Also,
-         *     if serverino is disabled, perhaps we should be using iunique()?
-         */
-        if (inum && (CIFS_SB(sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
-                inode->i_ino = (unsigned long) *inum;
-        /*
-         * must set this here instead of cifs_alloc_inode since VFS will
-         * clobber i_flags
-         */
-        if (sb->s_flags & MS_NOATIME)
-                inode->i_flags |= S_NOATIME | S_NOCMTIME;
-        insert_inode_hash(inode);
-        return inode;
+        cFYI(1, ("creating fake fattr for DFS referral"));
+        memset(fattr, 0, sizeof(*fattr));
+        fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
+        fattr->cf_uid = cifs_sb->mnt_uid;
+        fattr->cf_gid = cifs_sb->mnt_gid;
+        fattr->cf_atime = CURRENT_TIME;
+        fattr->cf_ctime = CURRENT_TIME;
+        fattr->cf_mtime = CURRENT_TIME;
+        fattr->cf_nlink = 2;
+        fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
 }
 int cifs_get_inode_info_unix(struct inode **pinode,
-        const unsigned char *full_path, struct super_block *sb, int xid)
+                             const unsigned char *full_path,
+                             struct super_block *sb, int xid)
 {
-        int rc = 0;
+        int rc;
        FILE_UNIX_BASIC_INFO find_data;
-        struct cifsTconInfo *pTcon;
+        struct cifs_fattr fattr;
-        struct inode *inode;
+        struct cifsTconInfo *tcon;
        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-        bool is_dfs_referral = false;
-        struct cifsInodeInfo *cifsInfo;
-        __u64 num_of_bytes;
-        __u64 end_of_file;
-        pTcon = cifs_sb->tcon;
+        tcon = cifs_sb->tcon;
        cFYI(1, ("Getting info on %s", full_path));
        /* could have done a find first instead but this returns more info */
-        rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data,
+        rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
                                  cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
-        if (rc == -EREMOTE && !is_dfs_referral) {
-                is_dfs_referral = true;
-                cFYI(DBG2, ("DFS ref"));
-                /* for DFS, server does not give us real inode data */
-                fill_fake_finddataunix(&find_data, sb);
-                rc = 0;
-        } else if (rc)
-                goto cgiiu_exit;
-        num_of_bytes = le64_to_cpu(find_data.NumOfBytes);
+        if (!rc) {
-        end_of_file = le64_to_cpu(find_data.EndOfFile);
+                cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
+        } else if (rc == -EREMOTE) {
+                cifs_create_dfs_fattr(&fattr, sb);
+                rc = 0;
+        } else {
+                return rc;
+        }
-        /* get new inode */
        if (*pinode == NULL) {
-                __u64 unique_id = le64_to_cpu(find_data.UniqueId);
+                /* get new inode */
-                *pinode = cifs_new_inode(sb, &unique_id);
+                *pinode = cifs_iget(sb, &fattr);
-                if (*pinode == NULL) {
+                if (!*pinode)
                        rc = -ENOMEM;
-                        goto cgiiu_exit;
+        } else {
-                }
+                /* we already have inode, update it */
+                cifs_fattr_to_inode(*pinode, &fattr);
        }
-        inode = *pinode;
-        cifsInfo = CIFS_I(inode);
-        cFYI(1, ("Old time %ld", cifsInfo->time));
-        cifsInfo->time = jiffies;
-        cFYI(1, ("New time %ld", cifsInfo->time));
-        /* this is ok to set on every inode revalidate */
-        atomic_set(&cifsInfo->inUse, 1);
-        cifs_unix_info_to_inode(inode, &find_data, 0);
-        if (num_of_bytes < end_of_file)
-                cFYI(1, ("allocation size less than end of file"));
-        cFYI(1, ("Size %ld and blocks %llu",
-                (unsigned long) inode->i_size,
-                (unsigned long long)inode->i_blocks));
-        cifs_set_ops(inode, is_dfs_referral);
-cgiiu_exit:
        return rc;
 }
-static int decode_sfu_inode(struct inode *inode, __u64 size,
+static int
-                            const unsigned char *path,
+cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
-                            struct cifs_sb_info *cifs_sb, int xid)
+              struct cifs_sb_info *cifs_sb, int xid)
 {
        int rc;
        int oplock = 0;
@@ -321,10 +284,15 @@ static int decode_sfu_inode(struct inode *inode, __u64 size,
        pbuf = buf;
-        if (size == 0) {
+        fattr->cf_mode &= ~S_IFMT;
-                inode->i_mode |= S_IFIFO;
+        if (fattr->cf_eof == 0) {
+                fattr->cf_mode |= S_IFIFO;
+                fattr->cf_dtype = DT_FIFO;
                return 0;
-        } else if (size < 8) {
+        } else if (fattr->cf_eof < 8) {
+                fattr->cf_mode |= S_IFREG;
+                fattr->cf_dtype = DT_REG;
                return -EINVAL;  /* EOPNOTSUPP? */
        }
@@ -336,42 +304,46 @@ static int decode_sfu_inode(struct inode *inode, __u64 size,
        if (rc == 0) {
                int buf_type = CIFS_NO_BUFFER;
                        /* Read header */
-                rc = CIFSSMBRead(xid, pTcon,
+                rc = CIFSSMBRead(xid, pTcon, netfid,
-                                 netfid,
                                 24 /* length */, 0 /* offset */,
                                 &bytes_read, &pbuf, &buf_type);
                if ((rc == 0) && (bytes_read >= 8)) {
                        if (memcmp("IntxBLK", pbuf, 8) == 0) {
                                cFYI(1, ("Block device"));
-                                inode->i_mode |= S_IFBLK;
+                                fattr->cf_mode |= S_IFBLK;
+                                fattr->cf_dtype = DT_BLK;
                                if (bytes_read == 24) {
                                        /* we have enough to decode dev num */
                                        __u64 mjr; /* major */
                                        __u64 mnr; /* minor */
                                        mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
                                        mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
-                                        inode->i_rdev = MKDEV(mjr, mnr);
+                                        fattr->cf_rdev = MKDEV(mjr, mnr);
                                }
                        } else if (memcmp("IntxCHR", pbuf, 8) == 0) {
                                cFYI(1, ("Char device"));
-                                inode->i_mode |= S_IFCHR;
+                                fattr->cf_mode |= S_IFCHR;
+                                fattr->cf_dtype = DT_CHR;
                                if (bytes_read == 24) {
                                        /* we have enough to decode dev num */
                                        __u64 mjr; /* major */
                                        __u64 mnr; /* minor */
                                        mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
                                        mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
-                                        inode->i_rdev = MKDEV(mjr, mnr);
+                                        fattr->cf_rdev = MKDEV(mjr, mnr);
                                }
                        } else if (memcmp("IntxLNK", pbuf, 7) == 0) {
                                cFYI(1, ("Symlink"));
-                                inode->i_mode |= S_IFLNK;
+                                fattr->cf_mode |= S_IFLNK;
+                                fattr->cf_dtype = DT_LNK;
                        } else {
-                                inode->i_mode |= S_IFREG; /* file? */
+                                fattr->cf_mode |= S_IFREG; /* file? */
+                                fattr->cf_dtype = DT_REG;
                                rc = -EOPNOTSUPP;
                        }
                } else {
-                        inode->i_mode |= S_IFREG; /* then it is a file */
+                        fattr->cf_mode |= S_IFREG; /* then it is a file */
+                        fattr->cf_dtype = DT_REG;
                        rc = -EOPNOTSUPP; /* or some unknown SFU type */
                }
                CIFSSMBClose(xid, pTcon, netfid);
@@ -381,9 +353,13 @@ static int decode_sfu_inode(struct inode *inode, __u64 size,
 #define SFBITS_MASK (S_ISVTX | S_ISGID | S_ISUID)  /* SETFILEBITS valid bits */
-static int get_sfu_mode(struct inode *inode,
+/*
-                        const unsigned char *path,
+ * Fetch mode bits as provided by SFU.
-                        struct cifs_sb_info *cifs_sb, int xid)
+ *
+ * FIXME: Doesn't this clobber the type bits we got from cifs_sfu_type?
+ */
+static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
+                         struct cifs_sb_info *cifs_sb, int xid)
 {
 #ifdef CONFIG_CIFS_XATTR
        ssize_t rc;
@@ -391,68 +367,80 @@ static int get_sfu_mode(struct inode *inode,
        __u32 mode;
        rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS",
-                        ea_value, 4 /* size of buf */, cifs_sb->local_nls,
+                            ea_value, 4 /* size of buf */, cifs_sb->local_nls,
-                cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+                            cifs_sb->mnt_cifs_flags &
+                                CIFS_MOUNT_MAP_SPECIAL_CHR);
        if (rc < 0)
                return (int)rc;
        else if (rc > 3) {
                mode = le32_to_cpu(*((__le32 *)ea_value));
-                inode->i_mode &= ~SFBITS_MASK;
+                fattr->cf_mode &= ~SFBITS_MASK;
-                cFYI(1, ("special bits 0%o org mode 0%o", mode, inode->i_mode));
+                cFYI(1, ("special bits 0%o org mode 0%o", mode,
-                inode->i_mode = (mode &  SFBITS_MASK) | inode->i_mode;
+                         fattr->cf_mode));
+                fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode;
                cFYI(1, ("special mode bits 0%o", mode));
-                return 0;
-        } else {
-                return 0;
        }
+        return 0;
 #else
        return -EOPNOTSUPP;
 #endif
 }
-/*
+/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
- *      Needed to setup inode data for the directory which is the
+static void
- *      junction to the new submount (ie to setup the fake directory
+cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
- *      which represents a DFS referral)
+                       struct cifs_sb_info *cifs_sb, bool adjust_tz)
- */
-static void fill_fake_finddata(FILE_ALL_INFO *pfnd_dat,
-                               struct super_block *sb)
 {
-        memset(pfnd_dat, 0, sizeof(FILE_ALL_INFO));
+        memset(fattr, 0, sizeof(*fattr));
+        fattr->cf_cifsattrs = le32_to_cpu(info->Attributes);
-/*      __le64 pfnd_dat->AllocationSize = cpu_to_le64(0);
+        if (info->DeletePending)
-        __le64 pfnd_dat->EndOfFile = cpu_to_le64(0);
+                fattr->cf_flags |= CIFS_FATTR_DELETE_PENDING;
-        __u8 pfnd_dat->DeletePending = 0;
-        __u8 pfnd_data->Directory = 0;
+        if (info->LastAccessTime)
-        __le32 pfnd_dat->EASize = 0;
+                fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
-        __u64 pfnd_dat->IndexNumber = 0;
+        else
-        __u64 pfnd_dat->IndexNumber1 = 0;  */
+                fattr->cf_atime = CURRENT_TIME;
-        pfnd_dat->CreationTime =
-                cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+        fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
-        pfnd_dat->LastAccessTime =
+        fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
-                cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
-        pfnd_dat->LastWriteTime =
+        if (adjust_tz) {
-                cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+                fattr->cf_ctime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
-        pfnd_dat->ChangeTime =
+                fattr->cf_mtime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
-                cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+        }
-        pfnd_dat->Attributes = cpu_to_le32(ATTR_DIRECTORY);
-        pfnd_dat->NumberOfLinks = cpu_to_le32(2);
+        fattr->cf_eof = le64_to_cpu(info->EndOfFile);
+        fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
+        if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
+                fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
+                fattr->cf_dtype = DT_DIR;
+        } else {
+                fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
+                fattr->cf_dtype = DT_REG;
+                /* clear write bits if ATTR_READONLY is set */
+                if (fattr->cf_cifsattrs & ATTR_READONLY)
+                        fattr->cf_mode &= ~(S_IWUGO);
+        }
+        fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+        fattr->cf_uid = cifs_sb->mnt_uid;
+        fattr->cf_gid = cifs_sb->mnt_gid;
 }
 int cifs_get_inode_info(struct inode **pinode,
        const unsigned char *full_path, FILE_ALL_INFO *pfindData,
        struct super_block *sb, int xid, const __u16 *pfid)
 {
-        int rc = 0;
+        int rc = 0, tmprc;
-        __u32 attr;
-        struct cifsInodeInfo *cifsInfo;
        struct cifsTconInfo *pTcon;
-        struct inode *inode;
        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
        char *buf = NULL;
        bool adjustTZ = false;
-        bool is_dfs_referral = false;
+        struct cifs_fattr fattr;
-        umode_t default_mode;
        pTcon = cifs_sb->tcon;
        cFYI(1, ("Getting info on %s", full_path));
@@ -487,163 +475,85 @@ int cifs_get_inode_info(struct inode **pinode,
                        adjustTZ = true;
                }
        }
-        /* dump_mem("\nQPathInfo return data",&findData, sizeof(findData)); */
-        if (rc == -EREMOTE) {
+        if (!rc) {
-                is_dfs_referral = true;
+                cifs_all_info_to_fattr(&fattr, (FILE_ALL_INFO *) pfindData,
-                fill_fake_finddata(pfindData, sb);
+                                       cifs_sb, adjustTZ);
+        } else if (rc == -EREMOTE) {
+                cifs_create_dfs_fattr(&fattr, sb);
                rc = 0;
-        } else if (rc)
+        } else {
                goto cgii_exit;
+        }
-        attr = le32_to_cpu(pfindData->Attributes);
+        /*
+         * If an inode wasn't passed in, then get the inode number
-        /* get new inode */
+         *
+         * Is an i_ino of zero legal? Can we use that to check if the server
+         * supports returning inode numbers?  Are there other sanity checks we
+         * can use to ensure that the server is really filling in that field?
+         *
+         * We can not use the IndexNumber field by default from Windows or
+         * Samba (in ALL_INFO buf) but we can request it explicitly. The SNIA
+         * CIFS spec claims that this value is unique within the scope of a
+         * share, and the windows docs hint that it's actually unique
+         * per-machine.
+         *
+         * There may be higher info levels that work but are there Windows
+         * servers or network appliances for which IndexNumber field is not
+         * guaranteed unique?
+         */
        if (*pinode == NULL) {
-                __u64 inode_num;
-                __u64 *pinum = &inode_num;
-                /* Is an i_ino of zero legal? Can we use that to check
-                   if the server supports returning inode numbers?  Are
-                   there other sanity checks we can use to ensure that
-                   the server is really filling in that field? */
-                /* We can not use the IndexNumber field by default from
-                   Windows or Samba (in ALL_INFO buf) but we can request
-                   it explicitly.  It may not be unique presumably if
-                   the server has multiple devices mounted under one share */
-                /* There may be higher info levels that work but are
-                   there Windows server or network appliances for which
-                   IndexNumber field is not guaranteed unique? */
                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
                        int rc1 = 0;
                        rc1 = CIFSGetSrvInodeNumber(xid, pTcon,
-                                        full_path, pinum,
+                                        full_path, &fattr.cf_uniqueid,
                                        cifs_sb->local_nls,
                                        cifs_sb->mnt_cifs_flags &
                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
                        if (rc1) {
                                cFYI(1, ("GetSrvInodeNum rc %d", rc1));
-                                pinum = NULL;
+                                fattr.cf_uniqueid = iunique(sb, ROOT_I);
-                                /* BB EOPNOSUPP disable SERVER_INUM? */
+                                /* disable serverino if call not supported */
+                                if (rc1 == -EINVAL)
+                                        cifs_sb->mnt_cifs_flags &=
+                                                        ~CIFS_MOUNT_SERVER_INUM;
                        }
                } else {
-                        pinum = NULL;
+                        fattr.cf_uniqueid = iunique(sb, ROOT_I);
-                }
-                *pinode = cifs_new_inode(sb, pinum);
-                if (*pinode == NULL) {
-                        rc = -ENOMEM;
-                        goto cgii_exit;
                }
-        }
-        inode = *pinode;
-        cifsInfo = CIFS_I(inode);
-        cifsInfo->cifsAttrs = attr;
-        cifsInfo->delete_pending = pfindData->DeletePending ? true : false;
-        cFYI(1, ("Old time %ld", cifsInfo->time));
-        cifsInfo->time = jiffies;
-        cFYI(1, ("New time %ld", cifsInfo->time));
-        /* blksize needs to be multiple of two. So safer to default to
-        blksize and blkbits set in superblock so 2**blkbits and blksize
-        will match rather than setting to:
-        (pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/
-        /* Linux can not store file creation time so ignore it */
-        if (pfindData->LastAccessTime)
-                inode->i_atime = cifs_NTtimeToUnix(pfindData->LastAccessTime);
-        else /* do not need to use current_fs_time - time not stored */
-                inode->i_atime = CURRENT_TIME;
-        inode->i_mtime = cifs_NTtimeToUnix(pfindData->LastWriteTime);
-        inode->i_ctime = cifs_NTtimeToUnix(pfindData->ChangeTime);
-        cFYI(DBG2, ("Attributes came in as 0x%x", attr));
-        if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
-                inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj;
-                inode->i_mtime.tv_sec += pTcon->ses->server->timeAdj;
-        }
-        /* get default inode mode */
-        if (attr & ATTR_DIRECTORY)
-                default_mode = cifs_sb->mnt_dir_mode;
-        else
-                default_mode = cifs_sb->mnt_file_mode;
-        /* set permission bits */
-        if (atomic_read(&cifsInfo->inUse) == 0 ||
-            (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0)
-                inode->i_mode = default_mode;
-        else {
-                /* just reenable write bits if !ATTR_READONLY */
-                if ((inode->i_mode & S_IWUGO) == 0 &&
-                    (attr & ATTR_READONLY) == 0)
-                        inode->i_mode |= (S_IWUGO & default_mode);
-                inode->i_mode &= ~S_IFMT;
-        }
-        /* clear write bits if ATTR_READONLY is set */
-        if (attr & ATTR_READONLY)
-                inode->i_mode &= ~S_IWUGO;
-        /* set inode type */
-        if ((attr & ATTR_SYSTEM) &&
-            (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) {
-                /* no need to fix endianness on 0 */
-                if (pfindData->EndOfFile == 0)
-                        inode->i_mode |= S_IFIFO;
-                else if (decode_sfu_inode(inode,
-                                le64_to_cpu(pfindData->EndOfFile),
-                                full_path, cifs_sb, xid))
-                        cFYI(1, ("unknown SFU file type\n"));
        } else {
-                if (attr & ATTR_DIRECTORY)
+                fattr.cf_uniqueid = CIFS_I(*pinode)->uniqueid;
-                        inode->i_mode |= S_IFDIR;
-                else
-                        inode->i_mode |= S_IFREG;
        }
-        cifsInfo->server_eof = le64_to_cpu(pfindData->EndOfFile);
+        /* query for SFU type info if supported and needed */
-        spin_lock(&inode->i_lock);
+        if (fattr.cf_cifsattrs & ATTR_SYSTEM &&
-        if (is_size_safe_to_change(cifsInfo, cifsInfo->server_eof)) {
+            cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
-                /* can not safely shrink the file size here if the
+                tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid);
-                   client is writing to it due to potential races */
+                if (tmprc)
-                i_size_write(inode, cifsInfo->server_eof);
+                        cFYI(1, ("cifs_sfu_type failed: %d", tmprc));
-                /* 512 bytes (2**9) is the fake blocksize that must be
-                   used for this calculation */
-                inode->i_blocks = (512 - 1 + le64_to_cpu(
-                                   pfindData->AllocationSize)) >> 9;
        }
-        spin_unlock(&inode->i_lock);
-        inode->i_nlink = le32_to_cpu(pfindData->NumberOfLinks);
-        /* BB fill in uid and gid here? with help from winbind?
-           or retrieve from NTFS stream extended attribute */
 #ifdef CONFIG_CIFS_EXPERIMENTAL
        /* fill in 0777 bits from ACL */
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
                cFYI(1, ("Getting mode bits from ACL"));
-                acl_to_uid_mode(cifs_sb, inode, full_path, pfid);
+                cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid);
        }
 #endif
-        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
-                /* fill in remaining high mode bits e.g. SUID, VTX */
-                get_sfu_mode(inode, full_path, cifs_sb, xid);
-        } else if (atomic_read(&cifsInfo->inUse) == 0) {
-                inode->i_uid = cifs_sb->mnt_uid;
-                inode->i_gid = cifs_sb->mnt_gid;
-                /* set so we do not keep refreshing these fields with
-                   bad data after user has changed them in memory */
-                atomic_set(&cifsInfo->inUse, 1);
-        }
-        cifs_set_ops(inode, is_dfs_referral);
+        /* fill in remaining high mode bits e.g. SUID, VTX */
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
+                cifs_sfu_mode(&fattr, full_path, cifs_sb, xid);
+        if (!*pinode) {
+                *pinode = cifs_iget(sb, &fattr);
+                if (!*pinode)
+                        rc = -ENOMEM;
+        } else {
+                cifs_fattr_to_inode(*pinode, &fattr);
+        }
 cgii_exit:
        kfree(buf);
@@ -695,33 +605,78 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
        return full_path;
 }
+static int
+cifs_find_inode(struct inode *inode, void *opaque)
+{
+        struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
+        if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
+                return 0;
+        return 1;
+}
+static int
+cifs_init_inode(struct inode *inode, void *opaque)
+{
+        struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
+        CIFS_I(inode)->uniqueid = fattr->cf_uniqueid;
+        return 0;
+}
+/* Given fattrs, get a corresponding inode */
+struct inode *
+cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
+{
+        unsigned long hash;
+        struct inode *inode;
+        cFYI(1, ("looking for uniqueid=%llu", fattr->cf_uniqueid));
+        /* hash down to 32 bits on 32-bit arch */
+        hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
+        inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
+        /* we have fattrs in hand, update the inode */
+        if (inode) {
+                cifs_fattr_to_inode(inode, fattr);
+                if (sb->s_flags & MS_NOATIME)
+                        inode->i_flags |= S_NOATIME | S_NOCMTIME;
+                if (inode->i_state & I_NEW) {
+                        inode->i_ino = hash;
+                        unlock_new_inode(inode);
+                }
+        }
+        return inode;
+}
 /* gets root inode */
 struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 {
        int xid;
        struct cifs_sb_info *cifs_sb;
-        struct inode *inode;
+        struct inode *inode = NULL;
        long rc;
        char *full_path;
-        inode = iget_locked(sb, ino);
+        cifs_sb = CIFS_SB(sb);
-        if (!inode)
-                return ERR_PTR(-ENOMEM);
-        if (!(inode->i_state & I_NEW))
-                return inode;
-        cifs_sb = CIFS_SB(inode->i_sb);
        full_path = cifs_build_path_to_root(cifs_sb);
        if (full_path == NULL)
                return ERR_PTR(-ENOMEM);
        xid = GetXid();
        if (cifs_sb->tcon->unix_ext)
-                rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
+                rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
-                                                xid);
        else
-                rc = cifs_get_inode_info(&inode, full_path, NULL, inode->i_sb,
+                rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
                                                xid, NULL);
+        if (!inode)
+                return ERR_PTR(-ENOMEM);
        if (rc && cifs_sb->tcon->ipc) {
                cFYI(1, ("ipc connection - fake read inode"));
                inode->i_mode |= S_IFDIR;
@@ -737,7 +692,6 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
                return ERR_PTR(rc);
        }
-        unlock_new_inode(inode);
        kfree(full_path);
        /* can not call macro FreeXid here since in a void func
@@ -988,8 +942,9 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
         * sb->s_vfs_rename_mutex here */
        full_path = build_path_from_dentry(dentry);
        if (full_path == NULL) {
+                rc = -ENOMEM;
                FreeXid(xid);
-                return -ENOMEM;
+                return rc;
        }
        if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -1062,44 +1017,6 @@ out_reval:
        return rc;
 }
-void posix_fill_in_inode(struct inode *tmp_inode,
-        FILE_UNIX_BASIC_INFO *pData, int isNewInode)
-{
-        struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
-        loff_t local_size;
-        struct timespec local_mtime;
-        cifsInfo->time = jiffies;
-        atomic_inc(&cifsInfo->inUse);
-        /* save mtime and size */
-        local_mtime = tmp_inode->i_mtime;
-        local_size  = tmp_inode->i_size;
-        cifs_unix_info_to_inode(tmp_inode, pData, 1);
-        cifs_set_ops(tmp_inode, false);
-        if (!S_ISREG(tmp_inode->i_mode))
-                return;
-        /*
-         * No sense invalidating pages for new inode
-         * since we we have not started caching
-         * readahead file data yet.
-         */
-        if (isNewInode)
-                return;
-        if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) &&
-                (local_size == tmp_inode->i_size)) {
-                cFYI(1, ("inode exists but unchanged"));
-        } else {
-                /* file may have changed on server */
-                cFYI(1, ("invalidate inode, readdir detected change"));
-                invalidate_remote_inode(tmp_inode);
-        }
-}
 int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 {
        int rc = 0, tmprc;
@@ -1108,6 +1025,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
        struct cifsTconInfo *pTcon;
        char *full_path = NULL;
        struct inode *newinode = NULL;
+        struct cifs_fattr fattr;
        cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode));
@@ -1118,8 +1036,9 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
+                rc = -ENOMEM;
                FreeXid(xid);
-                return -ENOMEM;
+                return rc;
        }
        if ((pTcon->ses->capabilities & CAP_UNIX) &&
@@ -1146,7 +1065,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
                        cFYI(1, ("posix mkdir returned 0x%x", rc));
                        d_drop(direntry);
                } else {
-                        __u64 unique_id;
                        if (pInfo->Type == cpu_to_le32(-1)) {
                                /* no return info, go query for it */
                                kfree(pInfo);
@@ -1160,20 +1078,15 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
                        else
                                direntry->d_op = &cifs_dentry_ops;
-                        unique_id = le64_to_cpu(pInfo->UniqueId);
+                        cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
-                        newinode = cifs_new_inode(inode->i_sb, &unique_id);
+                        newinode = cifs_iget(inode->i_sb, &fattr);
-                        if (newinode == NULL) {
+                        if (!newinode) {
                                kfree(pInfo);
                                goto mkdir_get_info;
                        }
-                        newinode->i_nlink = 2;
                        d_instantiate(direntry, newinode);
-                        /* we already checked in POSIXCreate whether
-                           frame was long enough */
-                        posix_fill_in_inode(direntry->d_inode,
-                                        pInfo, 1 /* NewInode */);
 #ifdef CONFIG_CIFS_DEBUG2
                        cFYI(1, ("instantiated dentry %p %s to inode %p",
                                direntry, direntry->d_name.name, newinode));
@@ -1236,10 +1149,10 @@ mkdir_get_info:
                                args.uid = NO_CHANGE_64;
                                args.gid = NO_CHANGE_64;
                        }
-                        CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args,
+                        CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args,
-                                            cifs_sb->local_nls,
+                                               cifs_sb->local_nls,
-                                            cifs_sb->mnt_cifs_flags &
+                                               cifs_sb->mnt_cifs_flags &
-                                            CIFS_MOUNT_MAP_SPECIAL_CHR);
+                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
                } else {
                        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
                            (mode & S_IWUGO) == 0) {
@@ -1303,8 +1216,9 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
+                rc = -ENOMEM;
                FreeXid(xid);
-                return -ENOMEM;
+                return rc;
        }
        rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls,
@@ -1508,8 +1422,9 @@ int cifs_revalidate(struct dentry *direntry)
           since that would deadlock */
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
+                rc = -ENOMEM;
                FreeXid(xid);
-                return -ENOMEM;
+                return rc;
        }
        cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
                 "jiffies %ld", full_path, direntry->d_inode,
@@ -1618,6 +1533,7 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
        if (!err) {
                generic_fillattr(dentry->d_inode, stat);
                stat->blksize = CIFS_MAX_MSGSIZE;
+                stat->ino = CIFS_I(dentry->d_inode)->uniqueid;
        }
        return err;
 }
@@ -1782,6 +1698,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
        struct cifsTconInfo *pTcon = cifs_sb->tcon;
        struct cifs_unix_set_info_args *args = NULL;
+        struct cifsFileInfo *open_file;
        cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x",
                 direntry->d_name.name, attrs->ia_valid));
@@ -1868,10 +1785,18 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
                args->ctime = NO_CHANGE_64;
        args->device = 0;
-        rc = CIFSSMBUnixSetInfo(xid, pTcon, full_path, args,
+        open_file = find_writable_file(cifsInode);
-                                cifs_sb->local_nls,
+        if (open_file) {
-                                cifs_sb->mnt_cifs_flags &
+                u16 nfid = open_file->netfid;
-                                CIFS_MOUNT_MAP_SPECIAL_CHR);
+                u32 npid = open_file->pid;
+                rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
+                atomic_dec(&open_file->wrtPending);
+        } else {
+                rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
+                                    cifs_sb->local_nls,
+                                    cifs_sb->mnt_cifs_flags &
+                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
+        }
        if (!rc)
                rc = inode_setattr(inode, attrs);
@@ -1911,8 +1836,9 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
+                rc = -ENOMEM;
                FreeXid(xid);
-                return -ENOMEM;
+                return rc;
        }
        /*
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index cd83c53fcbb5..fc1e0487eaee 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -172,8 +172,9 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
+                rc = -ENOMEM;
                FreeXid(xid);
-                return -ENOMEM;
+                return rc;
        }
        cFYI(1, ("Full path: %s", full_path));
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 32d6baa0a54f..bd6d6895730d 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -133,10 +133,12 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
        {0, 0}
 };
-/* Convert string containing dotted ip address to binary form */
+/*
-/* returns 0 if invalid address */
+ * Convert a string containing text IPv4 or IPv6 address to binary form.
+ *
-int
+ * Returns 0 on failure.
+ */
+static int
 cifs_inet_pton(const int address_family, const char *cp, void *dst)
 {
        int ret = 0;
@@ -153,6 +155,52 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst)
        return ret;
 }
+/*
+ * Try to convert a string to an IPv4 address and then attempt to convert
+ * it to an IPv6 address if that fails. Set the family field if either
+ * succeeds. If it's an IPv6 address and it has a '%' sign in it, try to
+ * treat the part following it as a numeric sin6_scope_id.
+ *
+ * Returns 0 on failure.
+ */
+int
+cifs_convert_address(char *src, void *dst)
+{
+        int rc;
+        char *pct, *endp;
+        struct sockaddr_in *s4 = (struct sockaddr_in *) dst;
+        struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst;
+        /* IPv4 address */
+        if (cifs_inet_pton(AF_INET, src, &s4->sin_addr.s_addr)) {
+                s4->sin_family = AF_INET;
+                return 1;
+        }
+        /* temporarily terminate string */
+        pct = strchr(src, '%');
+        if (pct)
+                *pct = '\0';
+        rc = cifs_inet_pton(AF_INET6, src, &s6->sin6_addr.s6_addr);
+        /* repair temp termination (if any) and make pct point to scopeid */
+        if (pct)
+                *pct++ = '%';
+        if (!rc)
+                return rc;
+        s6->sin6_family = AF_INET6;
+        if (pct) {
+                s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0);
+                if (!*pct || *endp)
+                        return 0;
+        }
+        return rc;
+}
 /*****************************************************************************
 convert a NT status code to a dos class/code
 *****************************************************************************/
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 86d0055dc529..f823a4a208a7 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -63,374 +63,123 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
 }
 #endif /* DEBUG2 */
-/* Returns 1 if new inode created, 2 if both dentry and inode were */
+/*
-/* Might check in the future if inode number changed so we can rehash inode */
+ * Find the dentry that matches "name". If there isn't one, create one. If it's
-static int
+ * a negative dentry or the uniqueid changed, then drop it and recreate it.
-construct_dentry(struct qstr *qstring, struct file *file,
+ */
-                 struct inode **ptmp_inode, struct dentry **pnew_dentry,
+static struct dentry *
-                 __u64 *inum)
+cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
+                    struct cifs_fattr *fattr)
 {
-        struct dentry *tmp_dentry = NULL;
+        struct dentry *dentry, *alias;
-        struct super_block *sb = file->f_path.dentry->d_sb;
+        struct inode *inode;
-        int rc = 0;
+        struct super_block *sb = parent->d_inode->i_sb;
+        cFYI(1, ("For %s", name->name));
+        dentry = d_lookup(parent, name);
+        if (dentry) {
+                /* FIXME: check for inode number changes? */
+                if (dentry->d_inode != NULL)
+                        return dentry;
+                d_drop(dentry);
+                dput(dentry);
+        }
-        cFYI(1, ("For %s", qstring->name));
+        dentry = d_alloc(parent, name);
+        if (dentry == NULL)
-        qstring->hash = full_name_hash(qstring->name, qstring->len);
+                return NULL;
-        tmp_dentry = d_lookup(file->f_path.dentry, qstring);
-        if (tmp_dentry) {
-                /* BB: overwrite old name? i.e. tmp_dentry->d_name and
-                 * tmp_dentry->d_name.len??
-                 */
-                cFYI(0, ("existing dentry with inode 0x%p",
-                         tmp_dentry->d_inode));
-                *ptmp_inode = tmp_dentry->d_inode;
-                if (*ptmp_inode == NULL) {
-                        *ptmp_inode = cifs_new_inode(sb, inum);
-                        if (*ptmp_inode == NULL)
-                                return rc;
-                        rc = 1;
-                }
-        } else {
-                tmp_dentry = d_alloc(file->f_path.dentry, qstring);
-                if (tmp_dentry == NULL) {
-                        cERROR(1, ("Failed allocating dentry"));
-                        *ptmp_inode = NULL;
-                        return rc;
-                }
-                if (CIFS_SB(sb)->tcon->nocase)
+        inode = cifs_iget(sb, fattr);
-                        tmp_dentry->d_op = &cifs_ci_dentry_ops;
+        if (!inode) {
-                else
+                dput(dentry);
-                        tmp_dentry->d_op = &cifs_dentry_ops;
+                return NULL;
+        }
-                *ptmp_inode = cifs_new_inode(sb, inum);
+        if (CIFS_SB(sb)->tcon->nocase)
-                if (*ptmp_inode == NULL)
+                dentry->d_op = &cifs_ci_dentry_ops;
-                        return rc;
+        else
-                rc = 2;
+                dentry->d_op = &cifs_dentry_ops;
+        alias = d_materialise_unique(dentry, inode);
+        if (alias != NULL) {
+                dput(dentry);
+                if (IS_ERR(alias))
+                        return NULL;
+                dentry = alias;
        }
-        tmp_dentry->d_time = jiffies;
+        return dentry;
-        *pnew_dentry = tmp_dentry;
-        return rc;
 }
-static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
+static void
-                          char *buf, unsigned int *pobject_type, int isNewInode)
+cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
 {
-        loff_t local_size;
+        fattr->cf_uid = cifs_sb->mnt_uid;
-        struct timespec local_mtime;
+        fattr->cf_gid = cifs_sb->mnt_gid;
-        struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
-        struct cifs_sb_info *cifs_sb = CIFS_SB(tmp_inode->i_sb);
-        __u32 attr;
-        __u64 allocation_size;
-        __u64 end_of_file;
-        umode_t default_mode;
-        /* save mtime and size */
-        local_mtime = tmp_inode->i_mtime;
-        local_size  = tmp_inode->i_size;
-        if (new_buf_type) {
-                FILE_DIRECTORY_INFO *pfindData = (FILE_DIRECTORY_INFO *)buf;
-                attr = le32_to_cpu(pfindData->ExtFileAttributes);
-                allocation_size = le64_to_cpu(pfindData->AllocationSize);
-                end_of_file = le64_to_cpu(pfindData->EndOfFile);
-                tmp_inode->i_atime =
-                        cifs_NTtimeToUnix(pfindData->LastAccessTime);
-                tmp_inode->i_mtime =
-                        cifs_NTtimeToUnix(pfindData->LastWriteTime);
-                tmp_inode->i_ctime =
-                        cifs_NTtimeToUnix(pfindData->ChangeTime);
-        } else { /* legacy, OS2 and DOS style */
-                int offset = cifs_sb->tcon->ses->server->timeAdj;
-                FIND_FILE_STANDARD_INFO *pfindData =
-                        (FIND_FILE_STANDARD_INFO *)buf;
-                tmp_inode->i_mtime = cnvrtDosUnixTm(pfindData->LastWriteDate,
-                                                    pfindData->LastWriteTime,
-                                                    offset);
-                tmp_inode->i_atime = cnvrtDosUnixTm(pfindData->LastAccessDate,
-                                                    pfindData->LastAccessTime,
-                                                    offset);
-                tmp_inode->i_ctime = cnvrtDosUnixTm(pfindData->LastWriteDate,
-                                                    pfindData->LastWriteTime,
-                                                    offset);
-                attr = le16_to_cpu(pfindData->Attributes);
-                allocation_size = le32_to_cpu(pfindData->AllocationSize);
-                end_of_file = le32_to_cpu(pfindData->DataSize);
-        }
-        /* Linux can not store file creation time unfortunately so ignore it */
+        if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
+                fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
-        cifsInfo->cifsAttrs = attr;
+                fattr->cf_dtype = DT_DIR;
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+        } else {
-        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
+                fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
-                /* get more accurate mode via ACL - so force inode refresh */
+                fattr->cf_dtype = DT_REG;
-                cifsInfo->time = 0;
-        } else
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
-                cifsInfo->time = jiffies;
-        /* treat dos attribute of read-only as read-only mode bit e.g. 555? */
-        /* 2767 perms - indicate mandatory locking */
-                /* BB fill in uid and gid here? with help from winbind?
-                   or retrieve from NTFS stream extended attribute */
-        if (atomic_read(&cifsInfo->inUse) == 0) {
-                tmp_inode->i_uid = cifs_sb->mnt_uid;
-                tmp_inode->i_gid = cifs_sb->mnt_gid;
-        }
-        if (attr & ATTR_DIRECTORY)
-                default_mode = cifs_sb->mnt_dir_mode;
-        else
-                default_mode = cifs_sb->mnt_file_mode;
-        /* set initial permissions */
-        if ((atomic_read(&cifsInfo->inUse) == 0) ||
-            (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0)
-                tmp_inode->i_mode = default_mode;
-        else {
-                /* just reenable write bits if !ATTR_READONLY */
-                if ((tmp_inode->i_mode & S_IWUGO) == 0 &&
-                    (attr & ATTR_READONLY) == 0)
-                        tmp_inode->i_mode |= (S_IWUGO & default_mode);
-                tmp_inode->i_mode &= ~S_IFMT;
        }
-        /* clear write bits if ATTR_READONLY is set */
+        if (fattr->cf_cifsattrs & ATTR_READONLY)
-        if (attr & ATTR_READONLY)
+                fattr->cf_mode &= ~S_IWUGO;
-                tmp_inode->i_mode &= ~S_IWUGO;
-        /* set inode type */
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL &&
-        if ((attr & ATTR_SYSTEM) &&
+            fattr->cf_cifsattrs & ATTR_SYSTEM) {
-            (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) {
+                if (fattr->cf_eof == 0)  {
-                if (end_of_file == 0)  {
+                        fattr->cf_mode &= ~S_IFMT;
-                        tmp_inode->i_mode |= S_IFIFO;
+                        fattr->cf_mode |= S_IFIFO;
-                        *pobject_type = DT_FIFO;
+                        fattr->cf_dtype = DT_FIFO;
                } else {
                        /*
-                         * trying to get the type can be slow, so just call
+                         * trying to get the type and mode via SFU can be slow,
-                         * this a regular file for now, and mark for reval
+                         * so just call those regular files for now, and mark
+                         * for reval
                         */
-                        tmp_inode->i_mode |= S_IFREG;
+                        fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
-                        *pobject_type = DT_REG;
-                        cifsInfo->time = 0;
-                }
-        } else {
-                if (attr & ATTR_DIRECTORY) {
-                        tmp_inode->i_mode |= S_IFDIR;
-                        *pobject_type = DT_DIR;
-                } else {
-                        tmp_inode->i_mode |= S_IFREG;
-                        *pobject_type = DT_REG;
                }
        }
+}
-        /* can not fill in nlink here as in qpathinfo version and Unx search */
+void
-        if (atomic_read(&cifsInfo->inUse) == 0)
+cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
-                atomic_set(&cifsInfo->inUse, 1);
+                       struct cifs_sb_info *cifs_sb)
+{
-        cifsInfo->server_eof = end_of_file;
+        memset(fattr, 0, sizeof(*fattr));
-        spin_lock(&tmp_inode->i_lock);
+        fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes);
-        if (is_size_safe_to_change(cifsInfo, end_of_file)) {
+        fattr->cf_eof = le64_to_cpu(info->EndOfFile);
-                /* can not safely change the file size here if the
+        fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
-                client is writing to it due to potential races */
+        fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
-                i_size_write(tmp_inode, end_of_file);
+        fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
+        fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
-        /* 512 bytes (2**9) is the fake blocksize that must be used */
-        /* for this calculation, even though the reported blocksize is larger */
+        cifs_fill_common_info(fattr, cifs_sb);
-                tmp_inode->i_blocks = (512 - 1 + allocation_size) >> 9;
-        }
-        spin_unlock(&tmp_inode->i_lock);
-        if (allocation_size < end_of_file)
-                cFYI(1, ("May be sparse file, allocation less than file size"));
-        cFYI(1, ("File Size %ld and blocks %llu",
-                (unsigned long)tmp_inode->i_size,
-                (unsigned long long)tmp_inode->i_blocks));
-        if (S_ISREG(tmp_inode->i_mode)) {
-                cFYI(1, ("File inode"));
-                tmp_inode->i_op = &cifs_file_inode_ops;
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
-                        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
-                                tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
-                        else
-                                tmp_inode->i_fop = &cifs_file_direct_ops;
-                } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
-                        tmp_inode->i_fop = &cifs_file_nobrl_ops;
-                else
-                        tmp_inode->i_fop = &cifs_file_ops;
-                if ((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
-                   (cifs_sb->tcon->ses->server->maxBuf <
-                        PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
-                        tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
-                else
-                        tmp_inode->i_data.a_ops = &cifs_addr_ops;
-                if (isNewInode)
-                        return; /* No sense invalidating pages for new inode
-                                   since have not started caching readahead file
-                                   data yet */
-                if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) &&
-                        (local_size == tmp_inode->i_size)) {
-                        cFYI(1, ("inode exists but unchanged"));
-                } else {
-                        /* file may have changed on server */
-                        cFYI(1, ("invalidate inode, readdir detected change"));
-                        invalidate_remote_inode(tmp_inode);
-                }
-        } else if (S_ISDIR(tmp_inode->i_mode)) {
-                cFYI(1, ("Directory inode"));
-                tmp_inode->i_op = &cifs_dir_inode_ops;
-                tmp_inode->i_fop = &cifs_dir_ops;
-        } else if (S_ISLNK(tmp_inode->i_mode)) {
-                cFYI(1, ("Symbolic Link inode"));
-                tmp_inode->i_op = &cifs_symlink_inode_ops;
-        } else {
-                cFYI(1, ("Init special inode"));
-                init_special_inode(tmp_inode, tmp_inode->i_mode,
-                                   tmp_inode->i_rdev);
-        }
 }
-static void unix_fill_in_inode(struct inode *tmp_inode,
+void
-        FILE_UNIX_INFO *pfindData, unsigned int *pobject_type, int isNewInode)
+cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
+                       struct cifs_sb_info *cifs_sb)
 {
-        loff_t local_size;
+        int offset = cifs_sb->tcon->ses->server->timeAdj;
-        struct timespec local_mtime;
-        struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
-        struct cifs_sb_info *cifs_sb = CIFS_SB(tmp_inode->i_sb);
-        __u32 type = le32_to_cpu(pfindData->Type);
-        __u64 num_of_bytes = le64_to_cpu(pfindData->NumOfBytes);
-        __u64 end_of_file = le64_to_cpu(pfindData->EndOfFile);
-        cifsInfo->time = jiffies;
-        atomic_inc(&cifsInfo->inUse);
-        /* save mtime and size */
-        local_mtime = tmp_inode->i_mtime;
-        local_size  = tmp_inode->i_size;
-        tmp_inode->i_atime =
-            cifs_NTtimeToUnix(pfindData->LastAccessTime);
-        tmp_inode->i_mtime =
-            cifs_NTtimeToUnix(pfindData->LastModificationTime);
-        tmp_inode->i_ctime =
-            cifs_NTtimeToUnix(pfindData->LastStatusChange);
-        tmp_inode->i_mode = le64_to_cpu(pfindData->Permissions);
-        /* since we set the inode type below we need to mask off type
-           to avoid strange results if bits above were corrupt */
-        tmp_inode->i_mode &= ~S_IFMT;
-        if (type == UNIX_FILE) {
-                *pobject_type = DT_REG;
-                tmp_inode->i_mode |= S_IFREG;
-        } else if (type == UNIX_SYMLINK) {
-                *pobject_type = DT_LNK;
-                tmp_inode->i_mode |= S_IFLNK;
-        } else if (type == UNIX_DIR) {
-                *pobject_type = DT_DIR;
-                tmp_inode->i_mode |= S_IFDIR;
-        } else if (type == UNIX_CHARDEV) {
-                *pobject_type = DT_CHR;
-                tmp_inode->i_mode |= S_IFCHR;
-                tmp_inode->i_rdev = MKDEV(le64_to_cpu(pfindData->DevMajor),
-                                le64_to_cpu(pfindData->DevMinor) & MINORMASK);
-        } else if (type == UNIX_BLOCKDEV) {
-                *pobject_type = DT_BLK;
-                tmp_inode->i_mode |= S_IFBLK;
-                tmp_inode->i_rdev = MKDEV(le64_to_cpu(pfindData->DevMajor),
-                                le64_to_cpu(pfindData->DevMinor) & MINORMASK);
-        } else if (type == UNIX_FIFO) {
-                *pobject_type = DT_FIFO;
-                tmp_inode->i_mode |= S_IFIFO;
-        } else if (type == UNIX_SOCKET) {
-                *pobject_type = DT_SOCK;
-                tmp_inode->i_mode |= S_IFSOCK;
-        } else {
-                /* safest to just call it a file */
-                *pobject_type = DT_REG;
-                tmp_inode->i_mode |= S_IFREG;
-                cFYI(1, ("unknown inode type %d", type));
-        }
-        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
+        memset(fattr, 0, sizeof(*fattr));
-                tmp_inode->i_uid = cifs_sb->mnt_uid;
+        fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate,
-        else
+                                            info->LastAccessTime, offset);
-                tmp_inode->i_uid = le64_to_cpu(pfindData->Uid);
+        fattr->cf_ctime = cnvrtDosUnixTm(info->LastWriteDate,
-        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
+                                            info->LastWriteTime, offset);
-                tmp_inode->i_gid = cifs_sb->mnt_gid;
+        fattr->cf_mtime = cnvrtDosUnixTm(info->LastWriteDate,
-        else
+                                            info->LastWriteTime, offset);
-                tmp_inode->i_gid = le64_to_cpu(pfindData->Gid);
-        tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks);
-        cifsInfo->server_eof = end_of_file;
-        spin_lock(&tmp_inode->i_lock);
-        if (is_size_safe_to_change(cifsInfo, end_of_file)) {
-                /* can not safely change the file size here if the
-                client is writing to it due to potential races */
-                i_size_write(tmp_inode, end_of_file);
-        /* 512 bytes (2**9) is the fake blocksize that must be used */
-        /* for this calculation, not the real blocksize */
-                tmp_inode->i_blocks = (512 - 1 + num_of_bytes) >> 9;
-        }
-        spin_unlock(&tmp_inode->i_lock);
-        if (S_ISREG(tmp_inode->i_mode)) {
+        fattr->cf_cifsattrs = le16_to_cpu(info->Attributes);
-                cFYI(1, ("File inode"));
+        fattr->cf_bytes = le32_to_cpu(info->AllocationSize);
-                tmp_inode->i_op = &cifs_file_inode_ops;
+        fattr->cf_eof = le32_to_cpu(info->DataSize);
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+        cifs_fill_common_info(fattr, cifs_sb);
-                        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
-                                tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
-                        else
-                                tmp_inode->i_fop = &cifs_file_direct_ops;
-                } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
-                        tmp_inode->i_fop = &cifs_file_nobrl_ops;
-                else
-                        tmp_inode->i_fop = &cifs_file_ops;
-                if ((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
-                   (cifs_sb->tcon->ses->server->maxBuf <
-                        PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
-                        tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
-                else
-                        tmp_inode->i_data.a_ops = &cifs_addr_ops;
-                if (isNewInode)
-                        return; /* No sense invalidating pages for new inode
-                                   since we have not started caching readahead
-                                   file data for it yet */
-                if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) &&
-                        (local_size == tmp_inode->i_size)) {
-                        cFYI(1, ("inode exists but unchanged"));
-                } else {
-                        /* file may have changed on server */
-                        cFYI(1, ("invalidate inode, readdir detected change"));
-                        invalidate_remote_inode(tmp_inode);
-                }
-        } else if (S_ISDIR(tmp_inode->i_mode)) {
-                cFYI(1, ("Directory inode"));
-                tmp_inode->i_op = &cifs_dir_inode_ops;
-                tmp_inode->i_fop = &cifs_dir_ops;
-        } else if (S_ISLNK(tmp_inode->i_mode)) {
-                cFYI(1, ("Symbolic Link inode"));
-                tmp_inode->i_op = &cifs_symlink_inode_ops;
-/* tmp_inode->i_fop = *//* do not need to set to anything */
-        } else {
-                cFYI(1, ("Special inode"));
-                init_special_inode(tmp_inode, tmp_inode->i_mode,
-                                   tmp_inode->i_rdev);
-        }
 }
 /* BB eventually need to add the following helper function to
@@ -872,7 +621,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
                        len = strnlen(filename, PATH_MAX);
                }
-                *pinum = le64_to_cpu(pFindData->UniqueId);
+                *pinum = le64_to_cpu(pFindData->basic.UniqueId);
        } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
                FILE_DIRECTORY_INFO *pFindData =
                        (FILE_DIRECTORY_INFO *)current_entry;
@@ -932,11 +681,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
        int rc = 0;
        struct qstr qstring;
        struct cifsFileInfo *pCifsF;
-        unsigned int obj_type;
+        u64    inum;
-        __u64  inum;
+        ino_t  ino;
+        struct super_block *sb;
        struct cifs_sb_info *cifs_sb;
-        struct inode *tmp_inode;
        struct dentry *tmp_dentry;
+        struct cifs_fattr fattr;
        /* get filename and len into qstring */
        /* get dentry */
@@ -954,60 +704,53 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
        if (rc != 0)
                return 0;
-        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+        sb = file->f_path.dentry->d_sb;
+        cifs_sb = CIFS_SB(sb);
        qstring.name = scratch_buf;
        rc = cifs_get_name_from_search_buf(&qstring, pfindEntry,
                        pCifsF->srch_inf.info_level,
                        pCifsF->srch_inf.unicode, cifs_sb,
-                        max_len,
+                        max_len, &inum /* returned */);
-                        &inum /* returned */);
        if (rc)
                return rc;
-        /* only these two infolevels return valid inode numbers */
-        if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX ||
-            pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO)
-                rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
-                                        &inum);
-        else
-                rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
-                                        NULL);
-        if ((tmp_inode == NULL) || (tmp_dentry == NULL))
-                return -ENOMEM;
-        /* we pass in rc below, indicating whether it is a new inode,
-           so we can figure out whether to invalidate the inode cached
-           data if the file has changed */
        if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX)
-                unix_fill_in_inode(tmp_inode,
+                cifs_unix_basic_to_fattr(&fattr,
-                                   (FILE_UNIX_INFO *)pfindEntry,
+                                 &((FILE_UNIX_INFO *) pfindEntry)->basic,
-                                   &obj_type, rc);
+                                 cifs_sb);
        else if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD)
-                fill_in_inode(tmp_inode, 0 /* old level 1 buffer type */,
+                cifs_std_info_to_fattr(&fattr, (FIND_FILE_STANDARD_INFO *)
-                                pfindEntry, &obj_type, rc);
+                                        pfindEntry, cifs_sb);
        else
-                fill_in_inode(tmp_inode, 1 /* NT */, pfindEntry, &obj_type, rc);
+                cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *)
+                                        pfindEntry, cifs_sb);
-        if (rc) /* new inode - needs to be tied to dentry */ {
+        /* FIXME: make _to_fattr functions fill this out */
-                d_instantiate(tmp_dentry, tmp_inode);
+        if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO)
-                if (rc == 2)
+                fattr.cf_uniqueid = inum;
-                        d_rehash(tmp_dentry);
+        else
-        }
+                fattr.cf_uniqueid = iunique(sb, ROOT_I);
+        ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
+        tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr);
        rc = filldir(direntry, qstring.name, qstring.len, file->f_pos,
-                     tmp_inode->i_ino, obj_type);
+                     ino, fattr.cf_dtype);
+        /*
+         * we can not return filldir errors to the caller since they are
+         * "normal" when the stat blocksize is too small - we return remapped
+         * error instead
+         *
+         * FIXME: This looks bogus. filldir returns -EOVERFLOW in the above
+         * case already. Why should we be clobbering other errors from it?
+         */
        if (rc) {
                cFYI(1, ("filldir rc = %d", rc));
-                /* we can not return filldir errors to the caller
-                since they are "normal" when the stat blocksize
-                is too small - we return remapped error instead */
                rc = -EOVERFLOW;
        }
        dput(tmp_dentry);
        return rc;
 }
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 897a052270f9..7085a6275c4c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -802,7 +802,7 @@ ssetup_ntlmssp_authenticate:
 #endif /* CONFIG_CIFS_UPCALL */
        } else {
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-                if ((experimEnabled > 1) && (type == RawNTLMSSP)) {
+                if (type == RawNTLMSSP) {
                        if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
                                cERROR(1, ("NTLMSSP requires Unicode support"));
                                rc = -ENOSYS;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index e9527eedc639..a75afa3dd9e1 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -64,8 +64,9 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
+                rc = -ENOMEM;
                FreeXid(xid);
-                return -ENOMEM;
+                return rc;
        }
        if (ea_name == NULL) {
                cFYI(1, ("Null xattr names not supported"));
@@ -118,8 +119,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
+                rc = -ENOMEM;
                FreeXid(xid);
-                return -ENOMEM;
+                return rc;
        }
        /* return dos attributes as pseudo xattr */
        /* return alt name if available as pseudo attr */
@@ -225,8 +227,9 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
+                rc = -ENOMEM;
                FreeXid(xid);
-                return -ENOMEM;
+                return rc;
        }
        /* return dos attributes as pseudo xattr */
        /* return alt name if available as pseudo attr */
@@ -351,8 +354,9 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
+                rc = -ENOMEM;
                FreeXid(xid);
-                return -ENOMEM;
+                return rc;
        }
        /* return dos attributes as pseudo xattr */
        /* return alt name if available as pseudo attr */
diff --git a/fs/compat.c b/fs/compat.c
index 6aefb776dfeb..94502dab972a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -32,7 +32,6 @@
 #include <linux/smb_mount.h>
 #include <linux/ncp_mount.h>
 #include <linux/nfs4_mount.h>
-#include <linux/smp_lock.h>
 #include <linux/syscalls.h>
 #include <linux/ctype.h>
 #include <linux/module.h>
@@ -471,7 +470,7 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
                ret = sys_fcntl(fd, cmd, (unsigned long)&f);
                set_fs(old_fs);
                if (cmd == F_GETLK && ret == 0) {
-                        /* GETLK was successfule and we need to return the data...
+                        /* GETLK was successful and we need to return the data...
                         * but it needs to fit in the compat structure.
                         * l_start shouldn't be too big, unless the original
                         * start + end is greater than COMPAT_OFF_T_MAX, in which
@@ -1486,8 +1485,8 @@ int compat_do_execve(char * filename,
        if (!bprm)
                goto out_files;
-        retval = mutex_lock_interruptible(&current->cred_guard_mutex);
+        retval = -ERESTARTNOINTR;
-        if (retval < 0)
+        if (mutex_lock_interruptible(&current->cred_guard_mutex))
                goto out_free;
        current->in_execve = 1;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index b83f6bcfa51a..f91fd51b32e3 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -19,6 +19,7 @@
 #include <linux/compiler.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
+#include <linux/smp_lock.h>
 #include <linux/ioctl.h>
 #include <linux/if.h>
 #include <linux/if_bridge.h>
@@ -31,6 +32,7 @@
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/vt.h>
+#include <linux/falloc.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/ppp_defs.h>
@@ -94,7 +96,6 @@
 #include <linux/atm_tcp.h>
 #include <linux/sonet.h>
 #include <linux/atm_suni.h>
-#include <linux/mtd/mtd.h>
 #include <linux/usb.h>
 #include <linux/usbdevice_fs.h>
@@ -788,12 +789,6 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
        if (put_user(compat_ptr(data), &sgio->usr_ptr))
                return -EFAULT;
-        if (copy_in_user(&sgio->status, &sgio32->status,
-                         (4 * sizeof(unsigned char)) +
-                         (2 * sizeof(unsigned short)) +
-                         (3 * sizeof(int))))
-                return -EFAULT;
        err = sys_ioctl(fd, cmd, (unsigned long) sgio);
        if (err >= 0) {
@@ -1411,46 +1406,6 @@ static int ioc_settimeout(unsigned int fd, unsigned int cmd, unsigned long arg)
 #define HIDPGETCONNLIST _IOR('H', 210, int)
 #define HIDPGETCONNINFO _IOR('H', 211, int)
-struct mtd_oob_buf32 {
-        u_int32_t start;
-        u_int32_t length;
-        compat_caddr_t ptr;     /* unsigned char* */
-};
-#define MEMWRITEOOB32   _IOWR('M',3,struct mtd_oob_buf32)
-#define MEMREADOOB32    _IOWR('M',4,struct mtd_oob_buf32)
-static int mtd_rw_oob(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct mtd_oob_buf __user *buf = compat_alloc_user_space(sizeof(*buf));
-        struct mtd_oob_buf32 __user *buf32 = compat_ptr(arg);
-        u32 data;
-        char __user *datap;
-        unsigned int real_cmd;
-        int err;
-        real_cmd = (cmd == MEMREADOOB32) ?
-                MEMREADOOB : MEMWRITEOOB;
-        if (copy_in_user(&buf->start, &buf32->start,
-                         2 * sizeof(u32)) ||
-            get_user(data, &buf32->ptr))
-                return -EFAULT;
-        datap = compat_ptr(data);
-        if (put_user(datap, &buf->ptr))
-                return -EFAULT;
-        err = sys_ioctl(fd, real_cmd, (unsigned long) buf);
-        if (!err) {
-                if (copy_in_user(&buf32->start, &buf->start,
-                                 2 * sizeof(u32)))
-                        err = -EFAULT;
-        }
-        return err;
-}       
 #ifdef CONFIG_BLOCK
 struct raw32_config_request
 {
@@ -1765,7 +1720,7 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a
 /* Since old style bridge ioctl's endup using SIOCDEVPRIVATE
 * for some operations; this forces use of the newer bridge-utils that
- * use compatiable ioctls
+ * use compatible ioctls
 */
 static int old_bridge_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
@@ -1826,6 +1781,41 @@ lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
        return sys_ioctl(fd, cmd, (unsigned long)tn);
 }
+/*
+ * On ia32 l_start is on a 32-bit boundary, so the 32-bit layout of the
+ * space reservation request is declared separately here, with its two
+ * 64-bit members marked packed.
+ */
+#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+struct space_resv_32 {
+        __s16           l_type;
+        __s16           l_whence;
+        __s64           l_start __attribute__((packed));
+                        /* len == 0 means until end of file */
+        __s64           l_len __attribute__((packed));
+        __s32           l_sysid;
+        __u32           l_pid;
+        __s32           l_pad[4];       /* reserve area */
+};
+#define FS_IOC_RESVSP_32                _IOW ('X', 40, struct space_resv_32)
+#define FS_IOC_RESVSP64_32      _IOW ('X', 42, struct space_resv_32)
+/*
+ * Just account for different alignment: the request at @arg is copied
+ * field by field from the 32-bit layout into a native struct space_resv
+ * obtained from compat_alloc_user_space(), and that copy is handed to
+ * ioctl_preallocate().
+ *
+ * Returns -EFAULT if any of the field copies fails, otherwise the result
+ * of ioctl_preallocate().
+ */
+static int compat_ioctl_preallocate(struct file *file, unsigned long arg)
+{
+        struct space_resv_32    __user *p32 = (void __user *)arg;
+        struct space_resv       __user *p = compat_alloc_user_space(sizeof(*p));
+        if (copy_in_user(&p->l_type,    &p32->l_type,   sizeof(s16)) ||
+            copy_in_user(&p->l_whence,  &p32->l_whence, sizeof(s16)) ||
+            copy_in_user(&p->l_start,   &p32->l_start,  sizeof(s64)) ||
+            copy_in_user(&p->l_len,     &p32->l_len,    sizeof(s64)) ||
+            copy_in_user(&p->l_sysid,   &p32->l_sysid,  sizeof(s32)) ||
+            copy_in_user(&p->l_pid,     &p32->l_pid,    sizeof(u32)) ||
+            copy_in_user(&p->l_pad,     &p32->l_pad,    4*sizeof(u32)))
+                return -EFAULT;
+        return ioctl_preallocate(file, p);
+}
+#endif
 typedef int (*ioctl_trans_handler_t)(unsigned int, unsigned int,
                                        unsigned long, struct file *);
@@ -1915,6 +1905,7 @@ COMPATIBLE_IOCTL(FIONCLEX)
 COMPATIBLE_IOCTL(FIOASYNC)
 COMPATIBLE_IOCTL(FIONBIO)
 COMPATIBLE_IOCTL(FIONREAD)  /* This is also TIOCINQ */
+COMPATIBLE_IOCTL(FS_IOC_FIEMAP)
 /* 0x00 */
 COMPATIBLE_IOCTL(FIBMAP)
 COMPATIBLE_IOCTL(FIGETBSZ)
@@ -2432,15 +2423,6 @@ COMPATIBLE_IOCTL(USBDEVFS_SUBMITURB32)
 COMPATIBLE_IOCTL(USBDEVFS_REAPURB32)
 COMPATIBLE_IOCTL(USBDEVFS_REAPURBNDELAY32)
 COMPATIBLE_IOCTL(USBDEVFS_CLEAR_HALT)
-/* MTD */
-COMPATIBLE_IOCTL(MEMGETINFO)
-COMPATIBLE_IOCTL(MEMERASE)
-COMPATIBLE_IOCTL(MEMLOCK)
-COMPATIBLE_IOCTL(MEMUNLOCK)
-COMPATIBLE_IOCTL(MEMGETREGIONCOUNT)
-COMPATIBLE_IOCTL(MEMGETREGIONINFO)
-COMPATIBLE_IOCTL(MEMGETBADBLOCK)
-COMPATIBLE_IOCTL(MEMSETBADBLOCK)
 /* NBD */
 ULONG_IOCTL(NBD_SET_SOCK)
 ULONG_IOCTL(NBD_SET_BLKSIZE)
@@ -2550,8 +2532,6 @@ COMPATIBLE_IOCTL(JSIOCGBUTTONS)
 COMPATIBLE_IOCTL(JSIOCGNAME(0))
 /* now things that need handlers */
-HANDLE_IOCTL(MEMREADOOB32, mtd_rw_oob)
-HANDLE_IOCTL(MEMWRITEOOB32, mtd_rw_oob)
 #ifdef CONFIG_NET
 HANDLE_IOCTL(SIOCGIFNAME, dev_ifname32)
 HANDLE_IOCTL(SIOCGIFCONF, dev_ifconf)
@@ -2814,6 +2794,18 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
        case FIOQSIZE:
                break;
+#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+        case FS_IOC_RESVSP_32:
+        case FS_IOC_RESVSP64_32:
+                error = compat_ioctl_preallocate(filp, arg);
+                goto out_fput;
+#else
+        case FS_IOC_RESVSP:
+        case FS_IOC_RESVSP64:
+                error = ioctl_preallocate(filp, (void __user *)arg);
+                goto out_fput;
+#endif
        case FIBMAP:
        case FIGETBSZ:
        case FIONREAD:
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 33a90120f6ad..4d74fc72c195 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -67,6 +67,8 @@ static int debugfs_u8_get(void *data, u64 *val)
        return 0;
 }
 DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
 /**
 * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value
@@ -95,6 +97,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
 struct dentry *debugfs_create_u8(const char *name, mode_t mode,
                                 struct dentry *parent, u8 *value)
 {
+        /* if there are no write bits set, make read only */
+        if (!(mode & S_IWUGO))
+                return debugfs_create_file(name, mode, parent, value, &fops_u8_ro);
+        /* if there are no read bits set, make write only */
+        if (!(mode & S_IRUGO))
+                return debugfs_create_file(name, mode, parent, value, &fops_u8_wo);
        return debugfs_create_file(name, mode, parent, value, &fops_u8);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u8);
@@ -110,6 +119,8 @@ static int debugfs_u16_get(void *data, u64 *val)
        return 0;
 }
 DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
 /**
 * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value
@@ -138,6 +149,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
 struct dentry *debugfs_create_u16(const char *name, mode_t mode,
                                  struct dentry *parent, u16 *value)
 {
+        /* if there are no write bits set, make read only */
+        if (!(mode & S_IWUGO))
+                return debugfs_create_file(name, mode, parent, value, &fops_u16_ro);
+        /* if there are no read bits set, make write only */
+        if (!(mode & S_IRUGO))
+                return debugfs_create_file(name, mode, parent, value, &fops_u16_wo);
        return debugfs_create_file(name, mode, parent, value, &fops_u16);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u16);
@@ -153,6 +171,8 @@ static int debugfs_u32_get(void *data, u64 *val)
        return 0;
 }
 DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
 /**
 * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value
@@ -181,6 +201,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
 struct dentry *debugfs_create_u32(const char *name, mode_t mode,
                                 struct dentry *parent, u32 *value)
 {
+        /* if there are no write bits set, make read only */
+        if (!(mode & S_IWUGO))
+                return debugfs_create_file(name, mode, parent, value, &fops_u32_ro);
+        /* if there are no read bits set, make write only */
+        if (!(mode & S_IRUGO))
+                return debugfs_create_file(name, mode, parent, value, &fops_u32_wo);
        return debugfs_create_file(name, mode, parent, value, &fops_u32);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u32);
@@ -197,6 +224,8 @@ static int debugfs_u64_get(void *data, u64 *val)
        return 0;
 }
 DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
 /**
 * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value
@@ -225,15 +254,28 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n");
 struct dentry *debugfs_create_u64(const char *name, mode_t mode,
                                 struct dentry *parent, u64 *value)
 {
+        /* if there are no write bits set, make read only */
+        if (!(mode & S_IWUGO))
+                return debugfs_create_file(name, mode, parent, value, &fops_u64_ro);
+        /* if there are no read bits set, make write only */
+        if (!(mode & S_IRUGO))
+                return debugfs_create_file(name, mode, parent, value, &fops_u64_wo);
        return debugfs_create_file(name, mode, parent, value, &fops_u64);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u64);
 DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n");
 DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n");
 DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
 /*
 * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value
@@ -256,6 +298,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"
 struct dentry *debugfs_create_x8(const char *name, mode_t mode,
                                 struct dentry *parent, u8 *value)
 {
+        /* if there are no write bits set, make read only */
+        if (!(mode & S_IWUGO))
+                return debugfs_create_file(name, mode, parent, value, &fops_x8_ro);
+        /* if there are no read bits set, make write only */
+        if (!(mode & S_IRUGO))
+                return debugfs_create_file(name, mode, parent, value, &fops_x8_wo);
        return debugfs_create_file(name, mode, parent, value, &fops_x8);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_x8);
@@ -273,6 +322,13 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8);
 struct dentry *debugfs_create_x16(const char *name, mode_t mode,
                                 struct dentry *parent, u16 *value)
 {
+        /* if there are no write bits set, make read only */
+        if (!(mode & S_IWUGO))
+                return debugfs_create_file(name, mode, parent, value, &fops_x16_ro);
+        /* if there are no read bits set, make write only */
+        if (!(mode & S_IRUGO))
+                return debugfs_create_file(name, mode, parent, value, &fops_x16_wo);
        return debugfs_create_file(name, mode, parent, value, &fops_x16);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_x16);
@@ -290,6 +346,13 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16);
 struct dentry *debugfs_create_x32(const char *name, mode_t mode,
                                 struct dentry *parent, u32 *value)
 {
+        /* if there are no write bits set, make read only */
+        if (!(mode & S_IWUGO))
+                return debugfs_create_file(name, mode, parent, value, &fops_x32_ro);
+        /* if there are no read bits set, make write only */
+        if (!(mode & S_IRUGO))
+                return debugfs_create_file(name, mode, parent, value, &fops_x32_wo);
        return debugfs_create_file(name, mode, parent, value, &fops_x32);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_x32);
@@ -419,7 +482,7 @@ static const struct file_operations fops_blob = {
 };
 /**
- * debugfs_create_blob - create a debugfs file that is used to read and write a binary blob
+ * debugfs_create_blob - create a debugfs file that is used to read a binary blob
 * @name: a pointer to a string containing the name of the file to create.
 * @mode: the permission that the file should have
 * @parent: a pointer to the parent dentry for this file.  This should be a
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 0662ba6de85a..d22438ef7674 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -403,6 +403,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
                }
                child = list_entry(parent->d_subdirs.next, struct dentry,
                                d_u.d_child);
+ next_sibling:
                /*
                 * If "child" isn't empty, walk down the tree and
@@ -417,6 +418,16 @@ void debugfs_remove_recursive(struct dentry *dentry)
                __debugfs_remove(child, parent);
                if (parent->d_subdirs.next == &child->d_u.d_child) {
                        /*
+                         * Try the next sibling.
+                         */
+                        if (child->d_u.d_child.next != &parent->d_subdirs) {
+                                child = list_entry(child->d_u.d_child.next,
+                                                   struct dentry,
+                                                   d_u.d_child);
+                                goto next_sibling;
+                        }
+                        /*
                         * Avoid infinite loop if we fail to remove
                         * one dentry.
                         */
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 9b1d285f9fe6..75efb028974b 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -423,7 +423,6 @@ static void devpts_kill_sb(struct super_block *sb)
 }
 static struct file_system_type devpts_fs_type = {
-        .owner          = THIS_MODULE,
        .name           = "devpts",
        .get_sb         = devpts_get_sb,
        .kill_sb        = devpts_kill_sb,
@@ -564,13 +563,4 @@ static int __init init_devpts_fs(void)
        }
        return err;
 }
-static void __exit exit_devpts_fs(void)
-{
-        unregister_filesystem(&devpts_fs_type);
-        mntput(devpts_mnt);
-}
 module_init(init_devpts_fs)
-module_exit(exit_devpts_fs)
-MODULE_LICENSE("GPL");
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 205ec95b347e..eb507c453c5f 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -435,7 +435,7 @@ static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
 static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
                    unsigned int flags, struct dlm_rsb **r_ret)
 {
-        struct dlm_rsb *r, *tmp;
+        struct dlm_rsb *r = NULL, *tmp;
        uint32_t hash, bucket;
        int error = -EINVAL;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index cdb580a9c7a2..618a60f03886 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -902,7 +902,7 @@ static void tcp_connect_to_sock(struct connection *con)
        int result = -EHOSTUNREACH;
        struct sockaddr_storage saddr, src_addr;
        int addr_len;
-        struct socket *sock;
+        struct socket *sock = NULL;
        if (con->nodeid == 0) {
                log_print("attempt to connect sock 0 foiled");
@@ -962,6 +962,8 @@ out_err:
        if (con->sock) {
                sock_release(con->sock);
                con->sock = NULL;
+        } else if (sock) {
+                sock_release(sock);
        }
        /*
         * Some errors are fatal and this list might need adjusting. For other
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 894a32d438d5..16f682e26c07 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -353,7 +353,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
 {
        struct dlm_plock_info info;
        struct plock_op *op;
-        int found = 0;
+        int found = 0, do_callback = 0;
        if (count != sizeof(info))
                return -EINVAL;
@@ -366,21 +366,24 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
        spin_lock(&ops_lock);
        list_for_each_entry(op, &recv_list, list) {
-                if (op->info.fsid == info.fsid && op->info.number == info.number &&
+                if (op->info.fsid == info.fsid &&
+                    op->info.number == info.number &&
                    op->info.owner == info.owner) {
+                        struct plock_xop *xop = (struct plock_xop *)op;
                        list_del_init(&op->list);
-                        found = 1;
-                        op->done = 1;
                        memcpy(&op->info, &info, sizeof(info));
+                        if (xop->callback)
+                                do_callback = 1;
+                        else
+                                op->done = 1;
+                        found = 1;
                        break;
                }
        }
        spin_unlock(&ops_lock);
        if (found) {
-                struct plock_xop *xop;
+                if (do_callback)
-                xop = (struct plock_xop *)op;
-                if (xop->callback)
                        dlm_plock_callback(op);
                else
                        wake_up(&recv_wq);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index b6a719a909f8..a2edb7913447 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -24,7 +24,7 @@ static void drop_pagecache_sb(struct super_block *sb)
                        continue;
                __iget(inode);
                spin_unlock(&inode_lock);
-                __invalidate_mapping_pages(inode->i_mapping, 0, -1, true);
+                invalidate_mapping_pages(inode->i_mapping, 0, -1);
                iput(toput_inode);
                toput_inode = inode;
                spin_lock(&inode_lock);
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index af737bb56cb7..259525c9abb8 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1303,6 +1303,13 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
        }
        (*new_auth_tok)->session_key.encrypted_key_size =
                (body_size - (ECRYPTFS_SALT_SIZE + 5));
+        if ((*new_auth_tok)->session_key.encrypted_key_size
+            > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
+                printk(KERN_WARNING "Tag 3 packet contains key larger "
+                       "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n");
+                rc = -EINVAL;
+                goto out_free;
+        }
        if (unlikely(data[(*packet_size)++] != 0x04)) {
                printk(KERN_WARNING "Unknown version number [%d]\n",
                       data[(*packet_size) - 1]);
@@ -1449,6 +1456,12 @@ parse_tag_11_packet(unsigned char *data, unsigned char *contents,
                rc = -EINVAL;
                goto out;
        }
+        if (unlikely((*tag_11_contents_size) > max_contents_bytes)) {
+                printk(KERN_ERR "Literal data section in tag 11 packet exceeds "
+                       "expected size\n");
+                rc = -EINVAL;
+                goto out;
+        }
        if (data[(*packet_size)++] != 0x62) {
                printk(KERN_WARNING "Unrecognizable packet\n");
                rc = -EINVAL;
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index 49308a29798a..7ee6f7e3a608 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -5,12 +5,12 @@
 */
 #include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
 #include "efs.h"
 static int efs_readdir(struct file *, void *, filldir_t);
 const struct file_operations efs_dir_operations = {
+        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .readdir        = efs_readdir,
 };
@@ -33,8 +33,6 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
        if (inode->i_size & (EFS_DIRBSIZE-1))
                printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n");
-        lock_kernel();
        /* work out where this entry can be found */
        block = filp->f_pos >> EFS_DIRBSIZE_BITS;
@@ -107,7 +105,6 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
        filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
 out:
-        unlock_kernel();
        return 0;
 }
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index c3fb5f9c4a44..1511bf9e5f80 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -8,7 +8,6 @@
 #include <linux/buffer_head.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/exportfs.h>
 #include "efs.h"
@@ -63,16 +62,12 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei
        efs_ino_t inodenum;
        struct inode * inode = NULL;
-        lock_kernel();
        inodenum = efs_find_entry(dir, dentry->d_name.name, dentry->d_name.len);
        if (inodenum) {
                inode = efs_iget(dir->i_sb, inodenum);
-                if (IS_ERR(inode)) {
+                if (IS_ERR(inode))
-                        unlock_kernel();
                        return ERR_CAST(inode);
-                }
        }
-        unlock_kernel();
        return d_splice_alias(inode, dentry);
 }
@@ -115,11 +110,9 @@ struct dentry *efs_get_parent(struct dentry *child)
        struct dentry *parent = ERR_PTR(-ENOENT);
        efs_ino_t ino;
-        lock_kernel();
        ino = efs_find_entry(child->d_inode, "..", 2);
        if (ino)
                parent = d_obtain_alias(efs_iget(child->d_inode->i_sb, ino));
-        unlock_kernel();
        return parent;
 }
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 41911ec83aaf..75117d0dac2b 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -9,7 +9,6 @@
 #include <linux/string.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
 #include "efs.h"
 static int efs_symlink_readpage(struct file *file, struct page *page)
@@ -22,9 +21,8 @@ static int efs_symlink_readpage(struct file *file, struct page *page)
  
        err = -ENAMETOOLONG;
        if (size > 2 * EFS_BLOCKSIZE)
-                goto fail_notlocked;
+                goto fail;
  
-        lock_kernel();
        /* read first 512 bytes of link target */
        err = -EIO;
        bh = sb_bread(inode->i_sb, efs_bmap(inode, 0));
@@ -40,14 +38,11 @@ static int efs_symlink_readpage(struct file *file, struct page *page)
                brelse(bh);
        }
        link[size] = '\0';
-        unlock_kernel();
        SetPageUptodate(page);
        kunmap(page);
        unlock_page(page);
        return 0;
 fail:
-        unlock_kernel();
-fail_notlocked:
        SetPageError(page);
        kunmap(page);
        unlock_page(page);
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 3f0e1974abdc..31d12de83a2a 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -14,35 +14,44 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/anon_inodes.h>
-#include <linux/eventfd.h>
 #include <linux/syscalls.h>
 #include <linux/module.h>
+#include <linux/kref.h>
+#include <linux/eventfd.h>
 struct eventfd_ctx {
+        struct kref kref;
        wait_queue_head_t wqh;
        /*
         * Every time that a write(2) is performed on an eventfd, the
         * value of the __u64 being written is added to "count" and a
         * wakeup is performed on "wqh". A read(2) will return the "count"
         * value to userspace, and will reset "count" to zero. The kernel
-         * size eventfd_signal() also, adds to the "count" counter and
+         * side eventfd_signal() also, adds to the "count" counter and
         * issue a wakeup.
         */
        __u64 count;
        unsigned int flags;
 };
-/*
+/**
- * Adds "n" to the eventfd counter "count". Returns "n" in case of
+ * eventfd_signal - Adds @n to the eventfd counter.
- * success, or a value lower then "n" in case of coutner overflow.
+ * @ctx: [in] Pointer to the eventfd context.
- * This function is supposed to be called by the kernel in paths
+ * @n: [in] Value of the counter to be added to the eventfd internal counter.
- * that do not allow sleeping. In this function we allow the counter
+ *          The value cannot be negative.
- * to reach the ULLONG_MAX value, and we signal this as overflow
+ *
- * condition by returining a POLLERR to poll(2).
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as overflow condition by returning a POLLERR
+ * to poll(2).
+ *
+ * Returns @n in case of success, a non-negative number lower than @n in case
+ * of overflow, or the following error codes:
+ *
+ * -EINVAL    : The value of @n is negative.
 */
-int eventfd_signal(struct file *file, int n)
+int eventfd_signal(struct eventfd_ctx *ctx, int n)
 {
-        struct eventfd_ctx *ctx = file->private_data;
        unsigned long flags;
        if (n < 0)
@@ -59,9 +68,45 @@ int eventfd_signal(struct file *file, int n)
 }
 EXPORT_SYMBOL_GPL(eventfd_signal);
+/* kref release callback: frees the eventfd context that embeds @kref. */
+static void eventfd_free(struct kref *kref)
+{
+        kfree(container_of(kref, struct eventfd_ctx, kref));
+}
+/**
+ * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to the eventfd context. It is dereferenced
+ *            unconditionally, so it must not be NULL.
+ *
+ * Takes an additional reference on @ctx through kref_get().
+ *
+ * Returns: The same @ctx pointer that was passed in; this function has no
+ * failure path.
+ */
+struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
+{
+        kref_get(&ctx->kref);
+        return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_get);
+/**
+ * eventfd_ctx_put - Releases a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to eventfd context.
+ *
+ * The eventfd context reference must have been previously acquired with
+ * eventfd_ctx_get(), eventfd_ctx_fdget() or eventfd_ctx_fileget().
+ *
+ * The reference is dropped through kref_put(), with eventfd_free() passed
+ * as the release callback.
+ */
+void eventfd_ctx_put(struct eventfd_ctx *ctx)
+{
+        kref_put(&ctx->kref, eventfd_free);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_put);
 static int eventfd_release(struct inode *inode, struct file *file)
 {
-        kfree(file->private_data);
+        struct eventfd_ctx *ctx = file->private_data;
+        wake_up_poll(&ctx->wqh, POLLHUP);
+        eventfd_ctx_put(ctx);
        return 0;
 }
@@ -185,6 +230,16 @@ static const struct file_operations eventfd_fops = {
        .write          = eventfd_write,
 };
+/**
+ * eventfd_fget - Acquire a reference of an eventfd file descriptor.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns a pointer to the eventfd file structure in case of success, or the
+ * following error pointer:
+ *
+ * -EBADF    : Invalid @fd file descriptor.
+ * -EINVAL   : The @fd file descriptor is not an eventfd file.
+ */
 struct file *eventfd_fget(int fd)
 {
        struct file *file;
@@ -201,6 +256,48 @@ struct file *eventfd_fget(int fd)
 }
 EXPORT_SYMBOL_GPL(eventfd_fget);
+/**
+ * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns a pointer to the internal eventfd context, otherwise the error
+ * pointers returned by the following functions:
+ *
+ * eventfd_fget
+ */
+struct eventfd_ctx *eventfd_ctx_fdget(int fd)
+{
+        struct file *file;
+        struct eventfd_ctx *ctx;
+        file = eventfd_fget(fd);
+        if (IS_ERR(file))
+                return (struct eventfd_ctx *) file;
+        ctx = eventfd_ctx_get(file->private_data);
+        fput(file);
+        return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
+/**
+ * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
+ * @file: [in] Eventfd file pointer.
+ *
+ * Returns a pointer to the internal eventfd context, otherwise the error
+ * pointer:
+ *
+ * -EINVAL   : The @file pointer does not refer to an eventfd file.
+ */
+struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
+{
+        if (file->f_op != &eventfd_fops)
+                return ERR_PTR(-EINVAL);
+        return eventfd_ctx_get(file->private_data);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {
        int fd;
@@ -217,6 +314,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
        if (!ctx)
                return -ENOMEM;
+        kref_init(&ctx->kref);
        init_waitqueue_head(&ctx->wqh);
        ctx->count = count;
        ctx->flags = flags;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5458e80fc558..085c5c063420 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -98,7 +98,7 @@ struct epoll_filefd {
 struct nested_call_node {
        struct list_head llink;
        void *cookie;
-        int cpu;
+        void *ctx;
 };
 /*
@@ -317,17 +317,17 @@ static void ep_nested_calls_init(struct nested_calls *ncalls)
 * @nproc: Nested call core function pointer.
 * @priv: Opaque data to be passed to the @nproc callback.
 * @cookie: Cookie to be used to identify this nested call.
+ * @ctx: Caller context; only list entries recorded with the same @ctx are checked.
 *
 * Returns: Returns the code returned by the @nproc callback, or -1 if
 *          the maximum recursion limit has been exceeded.
 */
 static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
                          int (*nproc)(void *, void *, int), void *priv,
-                          void *cookie)
+                          void *cookie, void *ctx)
 {
        int error, call_nests = 0;
        unsigned long flags;
-        int this_cpu = get_cpu();
        struct list_head *lsthead = &ncalls->tasks_call_list;
        struct nested_call_node *tncur;
        struct nested_call_node tnode;
@@ -340,7 +340,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
         * very much limited.
         */
        list_for_each_entry(tncur, lsthead, llink) {
-                if (tncur->cpu == this_cpu &&
+                if (tncur->ctx == ctx &&
                    (tncur->cookie == cookie || ++call_nests > max_nests)) {
                        /*
                         * Ops ... loop detected or maximum nest level reached.
@@ -352,7 +352,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
        }
        /* Add the current task and cookie to the list */
-        tnode.cpu = this_cpu;
+        tnode.ctx = ctx;
        tnode.cookie = cookie;
        list_add(&tnode.llink, lsthead);
@@ -364,10 +364,9 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
        /* Remove the current task from the list */
        spin_lock_irqsave(&ncalls->lock, flags);
        list_del(&tnode.llink);
- out_unlock:
+out_unlock:
        spin_unlock_irqrestore(&ncalls->lock, flags);
-        put_cpu();
        return error;
 }
@@ -408,8 +407,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
 */
 static void ep_poll_safewake(wait_queue_head_t *wq)
 {
+        int this_cpu = get_cpu();
        ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
-                       ep_poll_wakeup_proc, NULL, wq);
+                       ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
+        put_cpu();
 }
 /*
@@ -663,7 +666,7 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
         * could re-enter here.
         */
        pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
-                                   ep_poll_readyevents_proc, ep, ep);
+                                   ep_poll_readyevents_proc, ep, ep, current);
        return pollflags != -1 ? pollflags : 0;
 }
diff --git a/fs/exec.c b/fs/exec.c
index e639957d7a57..4a8849e45b21 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1277,8 +1277,8 @@ int do_execve(char * filename,
        if (!bprm)
                goto out_files;
-        retval = mutex_lock_interruptible(&current->cred_guard_mutex);
+        retval = -ERESTARTNOINTR;
-        if (retval < 0)
+        if (mutex_lock_interruptible(&current->cred_guard_mutex))
                goto out_free;
        current->in_execve = 1;
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index 24667eedc023..c6718e4817fe 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -2,9 +2,7 @@
 * common.h - Common definitions for both Kernel and user-mode utilities
 *
 * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
 * Copyright (C) 2008, 2009
 * Boaz Harrosh <bharrosh@panasas.com>
 *
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 65b0c8c776a1..4cfab1cc75c0 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -1,8 +1,6 @@
 /*
 * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
 * Copyright (C) 2008, 2009
 * Boaz Harrosh <bharrosh@panasas.com>
 *
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 0fd4c7859679..5ec72e020b22 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -1,8 +1,6 @@
 /*
 * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
 * Copyright (C) 2008, 2009
 * Boaz Harrosh <bharrosh@panasas.com>
 *
@@ -156,6 +154,9 @@ ino_t exofs_parent_ino(struct dentry *child);
 int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
                    struct inode *);
+/* super.c               */
+int exofs_sync_fs(struct super_block *sb, int wait);
 /*********************
 * operation vectors *
 *********************/
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 6ed7fe484752..839b9dc1e70f 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -1,8 +1,6 @@
 /*
 * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
 * Copyright (C) 2008, 2009
 * Boaz Harrosh <bharrosh@panasas.com>
 *
@@ -47,16 +45,23 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
 {
        int ret;
        struct address_space *mapping = filp->f_mapping;
+        struct inode *inode = dentry->d_inode;
+        struct super_block *sb;
        ret = filemap_write_and_wait(mapping);
        if (ret)
                return ret;
-        /*Note: file_fsync below also calles sync_blockdev, which is a no-op
+        /* sync the inode attributes */
-         *      for exofs, but other then that it does sync_inode and
+        ret = write_inode_now(inode, 1);
-         *      sync_superblock which is what we need here.
-         */
+        /* This is a good place to write the sb */
-        return file_fsync(filp, dentry, datasync);
+        /* TODO: Schedule an sb-sync on create */
+        sb = inode->i_sb;
+        if (sb->s_dirt)
+                exofs_sync_fs(sb, 1);
+        return ret;
 }
 static int exofs_flush(struct file *file, fl_owner_t id)
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 77d0a295eb1c..6c10f7476699 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1,8 +1,6 @@
 /*
 * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
 * Copyright (C) 2008, 2009
 * Boaz Harrosh <bharrosh@panasas.com>
 *
@@ -295,6 +293,9 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 err:
        if (!is_sync)
                _unlock_pcol_pages(pcol, ret, READ);
+        else /* Pages are unlocked by the caller in sync mode; only free the bio */
+                pcol_free(pcol);
        kfree(pcol_copy);
        if (or)
                osd_end_request(or);
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 77fdd765e76d..b7dd0c236863 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -1,8 +1,6 @@
 /*
 * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
 * Copyright (C) 2008, 2009
 * Boaz Harrosh <bharrosh@panasas.com>
 *
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
index b3d2ccb87aaa..4372542df284 100644
--- a/fs/exofs/osd.c
+++ b/fs/exofs/osd.c
@@ -1,8 +1,6 @@
 /*
 * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
 * Copyright (C) 2008, 2009
 * Boaz Harrosh <bharrosh@panasas.com>
 *
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 8216c5b77b53..5ab10c3bbebe 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -1,8 +1,6 @@
 /*
 * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
 * Copyright (C) 2008, 2009
 * Boaz Harrosh <bharrosh@panasas.com>
 *
@@ -33,6 +31,7 @@
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
+#include <linux/smp_lock.h>
 #include <linux/string.h>
 #include <linux/parser.h>
 #include <linux/vfs.h>
@@ -200,7 +199,7 @@ static const struct export_operations exofs_export_ops;
 /*
 * Write the superblock to the OSD
 */
-static int exofs_sync_fs(struct super_block *sb, int wait)
+int exofs_sync_fs(struct super_block *sb, int wait)
 {
        struct exofs_sb_info *sbi;
        struct exofs_fscb *fscb;
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
index 36e2d7bc7f7b..4dd687c3e747 100644
--- a/fs/exofs/symlink.c
+++ b/fs/exofs/symlink.c
@@ -1,8 +1,6 @@
 /*
 * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
 * Copyright (C) 2008, 2009
 * Boaz Harrosh <bharrosh@panasas.com>
 *
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index d46e38cb85c5..d636e1297cad 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -125,37 +125,12 @@ fail:
        return ERR_PTR(-EINVAL);
 }
-static inline struct posix_acl *
-ext2_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
-        struct posix_acl *acl = EXT2_ACL_NOT_CACHED;
-        spin_lock(&inode->i_lock);
-        if (*i_acl != EXT2_ACL_NOT_CACHED)
-                acl = posix_acl_dup(*i_acl);
-        spin_unlock(&inode->i_lock);
-        return acl;
-}
-static inline void
-ext2_iset_acl(struct inode *inode, struct posix_acl **i_acl,
-                   struct posix_acl *acl)
-{
-        spin_lock(&inode->i_lock);
-        if (*i_acl != EXT2_ACL_NOT_CACHED)
-                posix_acl_release(*i_acl);
-        *i_acl = posix_acl_dup(acl);
-        spin_unlock(&inode->i_lock);
-}
 /*
 * inode->i_mutex: don't care
 */
 static struct posix_acl *
 ext2_get_acl(struct inode *inode, int type)
 {
-        struct ext2_inode_info *ei = EXT2_I(inode);
        int name_index;
        char *value = NULL;
        struct posix_acl *acl;
@@ -164,23 +139,19 @@ ext2_get_acl(struct inode *inode, int type)
        if (!test_opt(inode->i_sb, POSIX_ACL))
                return NULL;
-        switch(type) {
+        acl = get_cached_acl(inode, type);
-                case ACL_TYPE_ACCESS:
+        if (acl != ACL_NOT_CACHED)
-                        acl = ext2_iget_acl(inode, &ei->i_acl);
+                return acl;
-                        if (acl != EXT2_ACL_NOT_CACHED)
-                                return acl;
+        switch (type) {
-                        name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
+        case ACL_TYPE_ACCESS:
-                        break;
+                name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
+                break;
-                case ACL_TYPE_DEFAULT:
+        case ACL_TYPE_DEFAULT:
-                        acl = ext2_iget_acl(inode, &ei->i_default_acl);
+                name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT;
-                        if (acl != EXT2_ACL_NOT_CACHED)
+                break;
-                                return acl;
+        default:
-                        name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+                BUG();
-                        break;
-                default:
-                        return ERR_PTR(-EINVAL);
        }
        retval = ext2_xattr_get(inode, name_index, "", NULL, 0);
        if (retval > 0) {
@@ -197,17 +168,9 @@ ext2_get_acl(struct inode *inode, int type)
                acl = ERR_PTR(retval);
        kfree(value);
-        if (!IS_ERR(acl)) {
+        if (!IS_ERR(acl))
-                switch(type) {
+                set_cached_acl(inode, type, acl);
-                        case ACL_TYPE_ACCESS:
-                                ext2_iset_acl(inode, &ei->i_acl, acl);
-                                break;
-                        case ACL_TYPE_DEFAULT:
-                                ext2_iset_acl(inode, &ei->i_default_acl, acl);
-                                break;
-                }
-        }
        return acl;
 }
@@ -217,7 +180,6 @@ ext2_get_acl(struct inode *inode, int type)
 static int
 ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 {
-        struct ext2_inode_info *ei = EXT2_I(inode);
        int name_index;
        void *value = NULL;
        size_t size = 0;
@@ -263,17 +225,8 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
        error = ext2_xattr_set(inode, name_index, "", value, size, 0);
        kfree(value);
-        if (!error) {
+        if (!error)
-                switch(type) {
+                set_cached_acl(inode, type, acl);
-                        case ACL_TYPE_ACCESS:
-                                ext2_iset_acl(inode, &ei->i_acl, acl);
-                                break;
-                        case ACL_TYPE_DEFAULT:
-                                ext2_iset_acl(inode, &ei->i_default_acl, acl);
-                                break;
-                }
-        }
        return error;
 }
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index b42cf578554b..ecefe478898f 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -53,10 +53,6 @@ static inline int ext2_acl_count(size_t size)
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
-/* Value for inode->u.ext2_i.i_acl and inode->u.ext2_i.i_default_acl
-   if the ACL has not been cached */
-#define EXT2_ACL_NOT_CACHED ((void *)-1)
 /* acl.c */
 extern int ext2_permission (struct inode *, int);
 extern int ext2_acl_chmod (struct inode *);
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 003500498c22..6cde970b0a1a 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -450,7 +450,7 @@ ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child)
 /* Releases the page */
 void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
-                        struct page *page, struct inode *inode)
+                   struct page *page, struct inode *inode, int update_times)
 {
        loff_t pos = page_offset(page) +
                        (char *) de - (char *) page_address(page);
@@ -465,7 +465,8 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
        ext2_set_de_type(de, inode);
        err = ext2_commit_chunk(page, pos, len);
        ext2_put_page(page);
-        dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+        if (update_times)
+                dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
        EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
        mark_inode_dirty(dir);
 }
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index b2bbf45039e0..9a8a8e27a063 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -27,7 +27,7 @@ struct ext2_inode_info {
        /*
         * i_block_group is the number of the block group which contains
         * this file's inode.  Constant across the lifetime of the inode,
-         * it is ued for making block allocation decisions - we try to
+         * it is used for making block allocation decisions - we try to
         * place a file's data blocks near its inode block, and new inodes
         * near to their parent directory's inode.
         */
@@ -47,10 +47,6 @@ struct ext2_inode_info {
         */
        struct rw_semaphore xattr_sem;
 #endif
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
-        struct posix_acl        *i_acl;
-        struct posix_acl        *i_default_acl;
-#endif
        rwlock_t i_meta_lock;
        /*
@@ -111,7 +107,7 @@ extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *,
 extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
 extern int ext2_empty_dir (struct inode *);
 extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
-extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *);
+extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
 /* ialloc.c */
 extern struct inode * ext2_new_inode (struct inode *, int);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 29ed682061f6..e27130341d4f 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1224,10 +1224,6 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
                return inode;
        ei = EXT2_I(inode);
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
-        ei->i_acl = EXT2_ACL_NOT_CACHED;
-        ei->i_default_acl = EXT2_ACL_NOT_CACHED;
-#endif
        ei->i_block_alloc_info = NULL;
        raw_inode = ext2_get_inode(inode->i_sb, ino, &bh);
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 7cb4badef927..e7431309bdca 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -13,7 +13,6 @@
 #include <linux/sched.h>
 #include <linux/compat.h>
 #include <linux/mount.h>
-#include <linux/smp_lock.h>
 #include <asm/current.h>
 #include <asm/uaccess.h>
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 90ea17998a73..e1dedb0f7873 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -66,8 +66,16 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
        inode = NULL;
        if (ino) {
                inode = ext2_iget(dir->i_sb, ino);
-                if (IS_ERR(inode))
+                if (unlikely(IS_ERR(inode))) {
-                        return ERR_CAST(inode);
+                        if (PTR_ERR(inode) == -ESTALE) {
+                                ext2_error(dir->i_sb, __func__,
+                                                "deleted inode referenced: %lu",
+                                                ino);
+                                return ERR_PTR(-EIO);
+                        } else {
+                                return ERR_CAST(inode);
+                        }
+                }
        }
        return d_splice_alias(inode, dentry);
 }
@@ -320,7 +328,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
                if (!new_de)
                        goto out_dir;
                inode_inc_link_count(old_inode);
-                ext2_set_link(new_dir, new_de, new_page, old_inode);
+                ext2_set_link(new_dir, new_de, new_page, old_inode, 1);
                new_inode->i_ctime = CURRENT_TIME_SEC;
                if (dir_de)
                        drop_nlink(new_inode);
@@ -352,7 +360,8 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
        inode_dec_link_count(old_inode);
        if (dir_de) {
-                ext2_set_link(old_inode, dir_de, dir_page, new_dir);
+                if (old_dir != new_dir)
+                        ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
                inode_dec_link_count(old_dir);
        }
        return 0;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 458999638c3d..1a9ffee47d56 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -152,10 +152,6 @@ static struct inode *ext2_alloc_inode(struct super_block *sb)
        ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL);
        if (!ei)
                return NULL;
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
-        ei->i_acl = EXT2_ACL_NOT_CACHED;
-        ei->i_default_acl = EXT2_ACL_NOT_CACHED;
-#endif
        ei->i_block_alloc_info = NULL;
        ei->vfs_inode.i_version = 1;
        return &ei->vfs_inode;
@@ -198,18 +194,6 @@ static void destroy_inodecache(void)
 static void ext2_clear_inode(struct inode *inode)
 {
        struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info;
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
-        struct ext2_inode_info *ei = EXT2_I(inode);
-        if (ei->i_acl && ei->i_acl != EXT2_ACL_NOT_CACHED) {
-                posix_acl_release(ei->i_acl);
-                ei->i_acl = EXT2_ACL_NOT_CACHED;
-        }
-        if (ei->i_default_acl && ei->i_default_acl != EXT2_ACL_NOT_CACHED) {
-                posix_acl_release(ei->i_default_acl);
-                ei->i_default_acl = EXT2_ACL_NOT_CACHED;
-        }
-#endif
        ext2_discard_reservation(inode);
        EXT2_I(inode)->i_block_alloc_info = NULL;
        if (unlikely(rsv))
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index d81ef2fdb08e..e167bae37ef0 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -126,30 +126,6 @@ fail:
        return ERR_PTR(-EINVAL);
 }
-static inline struct posix_acl *
-ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
-        struct posix_acl *acl = EXT3_ACL_NOT_CACHED;
-        spin_lock(&inode->i_lock);
-        if (*i_acl != EXT3_ACL_NOT_CACHED)
-                acl = posix_acl_dup(*i_acl);
-        spin_unlock(&inode->i_lock);
-        return acl;
-}
-static inline void
-ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl,
-                  struct posix_acl *acl)
-{
-        spin_lock(&inode->i_lock);
-        if (*i_acl != EXT3_ACL_NOT_CACHED)
-                posix_acl_release(*i_acl);
-        *i_acl = posix_acl_dup(acl);
-        spin_unlock(&inode->i_lock);
-}
 /*
 * Inode operation get_posix_acl().
 *
@@ -158,7 +134,6 @@ ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl,
 static struct posix_acl *
 ext3_get_acl(struct inode *inode, int type)
 {
-        struct ext3_inode_info *ei = EXT3_I(inode);
        int name_index;
        char *value = NULL;
        struct posix_acl *acl;
@@ -167,24 +142,21 @@ ext3_get_acl(struct inode *inode, int type)
        if (!test_opt(inode->i_sb, POSIX_ACL))
                return NULL;
-        switch(type) {
+        acl = get_cached_acl(inode, type);
-                case ACL_TYPE_ACCESS:
+        if (acl != ACL_NOT_CACHED)
-                        acl = ext3_iget_acl(inode, &ei->i_acl);
+                return acl;
-                        if (acl != EXT3_ACL_NOT_CACHED)
-                                return acl;
+        switch (type) {
-                        name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
+        case ACL_TYPE_ACCESS:
-                        break;
+                name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
+                break;
-                case ACL_TYPE_DEFAULT:
+        case ACL_TYPE_DEFAULT:
-                        acl = ext3_iget_acl(inode, &ei->i_default_acl);
+                name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
-                        if (acl != EXT3_ACL_NOT_CACHED)
+                break;
-                                return acl;
+        default:
-                        name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
+                BUG();
-                        break;
-                default:
-                        return ERR_PTR(-EINVAL);
        }
        retval = ext3_xattr_get(inode, name_index, "", NULL, 0);
        if (retval > 0) {
                value = kmalloc(retval, GFP_NOFS);
@@ -200,17 +172,9 @@ ext3_get_acl(struct inode *inode, int type)
                acl = ERR_PTR(retval);
        kfree(value);
-        if (!IS_ERR(acl)) {
+        if (!IS_ERR(acl))
-                switch(type) {
+                set_cached_acl(inode, type, acl);
-                        case ACL_TYPE_ACCESS:
-                                ext3_iset_acl(inode, &ei->i_acl, acl);
-                                break;
-                        case ACL_TYPE_DEFAULT:
-                                ext3_iset_acl(inode, &ei->i_default_acl, acl);
-                                break;
-                }
-        }
        return acl;
 }
@@ -223,7 +187,6 @@ static int
 ext3_set_acl(handle_t *handle, struct inode *inode, int type,
             struct posix_acl *acl)
 {
-        struct ext3_inode_info *ei = EXT3_I(inode);
        int name_index;
        void *value = NULL;
        size_t size = 0;
@@ -268,17 +231,10 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
                                      value, size, 0);
        kfree(value);
-        if (!error) {
-                switch(type) {
-                        case ACL_TYPE_ACCESS:
-                                ext3_iset_acl(inode, &ei->i_acl, acl);
-                                break;
-                        case ACL_TYPE_DEFAULT:
+        if (!error)
-                                ext3_iset_acl(inode, &ei->i_default_acl, acl);
+                set_cached_acl(inode, type, acl);
-                                break;
-                }
-        }
        return error;
 }
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 42da16b8cac0..07d15a3a5969 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -53,10 +53,6 @@ static inline int ext3_acl_count(size_t size)
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
-/* Value for inode->u.ext3_i.i_acl and inode->u.ext3_i.i_default_acl
-   if the ACL has not been cached */
-#define EXT3_ACL_NOT_CACHED ((void *)-1)
 /* acl.c */
 extern int ext3_permission (struct inode *, int);
 extern int ext3_acl_chmod (struct inode *);
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 3d724a95882f..373fa90c796a 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -130,8 +130,7 @@ static int ext3_readdir(struct file * filp,
                struct buffer_head *bh = NULL;
                map_bh.b_state = 0;
-                err = ext3_get_blocks_handle(NULL, inode, blk, 1,
+                err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
-                                                &map_bh, 0, 0);
                if (err > 0) {
                        pgoff_t index = map_bh.b_blocknr >>
                                        (PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b0248c6d5d4c..b49908a167ae 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -788,7 +788,7 @@ err_out:
 int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
                sector_t iblock, unsigned long maxblocks,
                struct buffer_head *bh_result,
-                int create, int extend_disksize)
+                int create)
 {
        int err = -EIO;
        int offsets[4];
@@ -820,7 +820,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
                while (count < maxblocks && count <= blocks_to_boundary) {
                        ext3_fsblk_t blk;
-                        if (!verify_chain(chain, partial)) {
+                        if (!verify_chain(chain, chain + depth - 1)) {
                                /*
                                 * Indirect block might be removed by
                                 * truncate while we were reading it.
@@ -911,13 +911,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
        if (!err)
                err = ext3_splice_branch(handle, inode, iblock,
                                        partial, indirect_blks, count);
-        /*
-         * i_disksize growing is protected by truncate_mutex.  Don't forget to
-         * protect it if you're about to implement concurrent
-         * ext3_get_block() -bzzz
-        */
-        if (!err && extend_disksize && inode->i_size > ei->i_disksize)
-                ei->i_disksize = inode->i_size;
        mutex_unlock(&ei->truncate_mutex);
        if (err)
                goto cleanup;
@@ -972,7 +965,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock,
        }
        ret = ext3_get_blocks_handle(handle, inode, iblock,
-                                        max_blocks, bh_result, create, 0);
+                                        max_blocks, bh_result, create);
        if (ret > 0) {
                bh_result->b_size = (ret << inode->i_blkbits);
                ret = 0;
@@ -1005,7 +998,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
        dummy.b_blocknr = -1000;
        buffer_trace_init(&dummy.b_history);
        err = ext3_get_blocks_handle(handle, inode, block, 1,
-                                        &dummy, create, 1);
+                                        &dummy, create);
        /*
         * ext3_get_blocks_handle() returns number of blocks
         * mapped. 0 in case of a HOLE.
@@ -1193,15 +1186,16 @@ write_begin_failed:
                 * i_size_read because we hold i_mutex.
                 *
                 * Add inode to orphan list in case we crash before truncate
-                 * finishes.
+                 * finishes. Do this only if ext3_can_truncate() agrees so
+                 * that orphan processing code is happy.
                 */
-                if (pos + len > inode->i_size)
+                if (pos + len > inode->i_size && ext3_can_truncate(inode))
                        ext3_orphan_add(handle, inode);
                ext3_journal_stop(handle);
                unlock_page(page);
                page_cache_release(page);
                if (pos + len > inode->i_size)
-                        vmtruncate(inode, inode->i_size);
+                        ext3_truncate(inode);
        }
        if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
@@ -1287,7 +1281,7 @@ static int ext3_ordered_write_end(struct file *file,
         * There may be allocated blocks outside of i_size because
         * we failed to copy some data. Prepare for truncate.
         */
-        if (pos + len > inode->i_size)
+        if (pos + len > inode->i_size && ext3_can_truncate(inode))
                ext3_orphan_add(handle, inode);
        ret2 = ext3_journal_stop(handle);
        if (!ret)
@@ -1296,7 +1290,7 @@ static int ext3_ordered_write_end(struct file *file,
        page_cache_release(page);
        if (pos + len > inode->i_size)
-                vmtruncate(inode, inode->i_size);
+                ext3_truncate(inode);
        return ret ? ret : copied;
 }
@@ -1315,14 +1309,14 @@ static int ext3_writeback_write_end(struct file *file,
         * There may be allocated blocks outside of i_size because
         * we failed to copy some data. Prepare for truncate.
         */
-        if (pos + len > inode->i_size)
+        if (pos + len > inode->i_size && ext3_can_truncate(inode))
                ext3_orphan_add(handle, inode);
        ret = ext3_journal_stop(handle);
        unlock_page(page);
        page_cache_release(page);
        if (pos + len > inode->i_size)
-                vmtruncate(inode, inode->i_size);
+                ext3_truncate(inode);
        return ret ? ret : copied;
 }
@@ -1358,7 +1352,7 @@ static int ext3_journalled_write_end(struct file *file,
         * There may be allocated blocks outside of i_size because
         * we failed to copy some data. Prepare for truncate.
         */
-        if (pos + len > inode->i_size)
+        if (pos + len > inode->i_size && ext3_can_truncate(inode))
                ext3_orphan_add(handle, inode);
        EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
        if (inode->i_size > EXT3_I(inode)->i_disksize) {
@@ -1375,7 +1369,7 @@ static int ext3_journalled_write_end(struct file *file,
        page_cache_release(page);
        if (pos + len > inode->i_size)
-                vmtruncate(inode, inode->i_size);
+                ext3_truncate(inode);
        return ret ? ret : copied;
 }
@@ -2374,7 +2368,7 @@ void ext3_truncate(struct inode *inode)
        struct page *page;
        if (!ext3_can_truncate(inode))
-                return;
+                goto out_notrans;
        if (inode->i_size == 0 && ext3_should_writeback_data(inode))
                ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
@@ -2390,7 +2384,7 @@ void ext3_truncate(struct inode *inode)
                page = grab_cache_page(mapping,
                                inode->i_size >> PAGE_CACHE_SHIFT);
                if (!page)
-                        return;
+                        goto out_notrans;
        }
        handle = start_transaction(inode);
@@ -2401,7 +2395,7 @@ void ext3_truncate(struct inode *inode)
                        unlock_page(page);
                        page_cache_release(page);
                }
-                return;         /* AKPM: return what? */
+                goto out_notrans;
        }
        last_block = (inode->i_size + blocksize-1)
@@ -2525,6 +2519,14 @@ out_stop:
                ext3_orphan_del(handle, inode);
        ext3_journal_stop(handle);
+        return;
+out_notrans:
+        /*
+         * Delete the inode from orphan list so that it doesn't stay there
+         * forever and trigger assertion on umount.
+         */
+        if (inode->i_nlink)
+                ext3_orphan_del(NULL, inode);
 }
 static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
@@ -2744,10 +2746,6 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
                return inode;
        ei = EXT3_I(inode);
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-        ei->i_acl = EXT3_ACL_NOT_CACHED;
-        ei->i_default_acl = EXT3_ACL_NOT_CACHED;
-#endif
        ei->i_block_alloc_info = NULL;
        ret = __ext3_get_inode_loc(inode, &iloc, 0);
@@ -3122,12 +3120,6 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
        rc = inode_setattr(inode, attr);
-        /* If inode_setattr's call to ext3_truncate failed to get a
-         * transaction handle at all, we need to clean up the in-core
-         * orphan list manually. */
-        if (inode->i_nlink)
-                ext3_orphan_del(NULL, inode);
        if (!rc && (ia_valid & ATTR_MODE))
                rc = ext3_acl_chmod(inode);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 8a0b26340b54..8359e7b3dc89 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -990,7 +990,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
                        sb->s_id, n_blocks_count);
                if (sizeof(sector_t) < 8)
                        ext3_warning(sb, __func__,
-                        "CONFIG_LBD not enabled\n");
+                        "CONFIG_LBDAF not enabled\n");
                return -EINVAL;
        }
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 26aa64dee6aa..524b349c6299 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -464,10 +464,6 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
        ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS);
        if (!ei)
                return NULL;
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-        ei->i_acl = EXT3_ACL_NOT_CACHED;
-        ei->i_default_acl = EXT3_ACL_NOT_CACHED;
-#endif
        ei->i_block_alloc_info = NULL;
        ei->vfs_inode.i_version = 1;
        return &ei->vfs_inode;
@@ -518,18 +514,6 @@ static void destroy_inodecache(void)
 static void ext3_clear_inode(struct inode *inode)
 {
        struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-        if (EXT3_I(inode)->i_acl &&
-                        EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) {
-                posix_acl_release(EXT3_I(inode)->i_acl);
-                EXT3_I(inode)->i_acl = EXT3_ACL_NOT_CACHED;
-        }
-        if (EXT3_I(inode)->i_default_acl &&
-                        EXT3_I(inode)->i_default_acl != EXT3_ACL_NOT_CACHED) {
-                posix_acl_release(EXT3_I(inode)->i_default_acl);
-                EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED;
-        }
-#endif
        ext3_discard_reservation(inode);
        EXT3_I(inode)->i_block_alloc_info = NULL;
        if (unlikely(rsv))
@@ -1812,7 +1796,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                printk(KERN_ERR "EXT3-fs: filesystem on %s:"
                        " too large to mount safely\n", sb->s_id);
                if (sizeof(sector_t) < 8)
-                        printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not "
+                        printk(KERN_WARNING "EXT3-fs: CONFIG_LBDAF not "
                                        "enabled\n");
                goto failed_mount;
        }
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8a34710ecf40..8867b2a1e5fe 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
 ext4-y  := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
                ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-                ext4_jbd2.o migrate.o mballoc.o block_validity.o
+                ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
 ext4-$(CONFIG_EXT4_FS_XATTR)            += xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)        += acl.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 647e0d65a284..f6d8967149ca 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -126,30 +126,6 @@ fail:
        return ERR_PTR(-EINVAL);
 }
-static inline struct posix_acl *
-ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
-        struct posix_acl *acl = EXT4_ACL_NOT_CACHED;
-        spin_lock(&inode->i_lock);
-        if (*i_acl != EXT4_ACL_NOT_CACHED)
-                acl = posix_acl_dup(*i_acl);
-        spin_unlock(&inode->i_lock);
-        return acl;
-}
-static inline void
-ext4_iset_acl(struct inode *inode, struct posix_acl **i_acl,
-                struct posix_acl *acl)
-{
-        spin_lock(&inode->i_lock);
-        if (*i_acl != EXT4_ACL_NOT_CACHED)
-                posix_acl_release(*i_acl);
-        *i_acl = posix_acl_dup(acl);
-        spin_unlock(&inode->i_lock);
-}
 /*
 * Inode operation get_posix_acl().
 *
@@ -158,7 +134,6 @@ ext4_iset_acl(struct inode *inode, struct posix_acl **i_acl,
 static struct posix_acl *
 ext4_get_acl(struct inode *inode, int type)
 {
-        struct ext4_inode_info *ei = EXT4_I(inode);
        int name_index;
        char *value = NULL;
        struct posix_acl *acl;
@@ -167,23 +142,19 @@ ext4_get_acl(struct inode *inode, int type)
        if (!test_opt(inode->i_sb, POSIX_ACL))
                return NULL;
+        acl = get_cached_acl(inode, type);
+        if (acl != ACL_NOT_CACHED)
+                return acl;
        switch (type) {
        case ACL_TYPE_ACCESS:
-                acl = ext4_iget_acl(inode, &ei->i_acl);
-                if (acl != EXT4_ACL_NOT_CACHED)
-                        return acl;
                name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
                break;
        case ACL_TYPE_DEFAULT:
-                acl = ext4_iget_acl(inode, &ei->i_default_acl);
-                if (acl != EXT4_ACL_NOT_CACHED)
-                        return acl;
                name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
                break;
        default:
-                return ERR_PTR(-EINVAL);
+                BUG();
        }
        retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
        if (retval > 0) {
@@ -200,17 +171,9 @@ ext4_get_acl(struct inode *inode, int type)
                acl = ERR_PTR(retval);
        kfree(value);
-        if (!IS_ERR(acl)) {
+        if (!IS_ERR(acl))
-                switch (type) {
+                set_cached_acl(inode, type, acl);
-                case ACL_TYPE_ACCESS:
-                        ext4_iset_acl(inode, &ei->i_acl, acl);
-                        break;
-                case ACL_TYPE_DEFAULT:
-                        ext4_iset_acl(inode, &ei->i_default_acl, acl);
-                        break;
-                }
-        }
        return acl;
 }
@@ -223,7 +186,6 @@ static int
 ext4_set_acl(handle_t *handle, struct inode *inode, int type,
             struct posix_acl *acl)
 {
-        struct ext4_inode_info *ei = EXT4_I(inode);
        int name_index;
        void *value = NULL;
        size_t size = 0;
@@ -268,17 +230,9 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
                                      value, size, 0);
        kfree(value);
-        if (!error) {
+        if (!error)
-                switch (type) {
+                set_cached_acl(inode, type, acl);
-                case ACL_TYPE_ACCESS:
-                        ext4_iset_acl(inode, &ei->i_acl, acl);
-                        break;
-                case ACL_TYPE_DEFAULT:
-                        ext4_iset_acl(inode, &ei->i_default_acl, acl);
-                        break;
-                }
-        }
        return error;
 }
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index cb45257a246e..949789d2bba6 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -53,10 +53,6 @@ static inline int ext4_acl_count(size_t size)
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
-/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
-   if the ACL has not been cached */
-#define EXT4_ACL_NOT_CACHED ((void *)-1)
 /* acl.c */
 extern int ext4_permission(struct inode *, int);
 extern int ext4_acl_chmod(struct inode *);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index cc7d5edc38c9..9714db393efe 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -93,20 +93,20 @@ typedef unsigned int ext4_group_t;
 struct ext4_allocation_request {
        /* target inode for block we're allocating */
        struct inode *inode;
+        /* how many blocks we want to allocate */
+        unsigned int len;
        /* logical block in target inode */
        ext4_lblk_t logical;
-        /* phys. target (a hint) */
-        ext4_fsblk_t goal;
        /* the closest logical allocated block to the left */
        ext4_lblk_t lleft;
-        /* phys. block for ^^^ */
-        ext4_fsblk_t pleft;
        /* the closest logical allocated block to the right */
        ext4_lblk_t lright;
-        /* phys. block for ^^^ */
+        /* phys. target (a hint) */
+        ext4_fsblk_t goal;
+        /* phys. block for the closest logical allocated block to the left */
+        ext4_fsblk_t pleft;
+        /* phys. block for the closest logical allocated block to the right */
        ext4_fsblk_t pright;
-        /* how many blocks we want to allocate */
-        unsigned int len;
        /* flags. see above EXT4_MB_HINT_* */
        unsigned int flags;
 };
@@ -352,6 +352,7 @@ struct ext4_new_group_data {
 /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
 #define EXT4_IOC_ALLOC_DA_BLKS          _IO('f', 12)
+#define EXT4_IOC_MOVE_EXT               _IOWR('f', 15, struct move_extent)
 /*
 * ioctl commands in 32 bit emulation
@@ -447,6 +448,15 @@ struct ext4_inode {
        __le32  i_version_hi;   /* high 32 bits for 64-bit version */
 };
+struct move_extent {
+        __u32 reserved;         /* should be zero */
+        __u32 donor_fd;         /* file descriptor of the donor file */
+        __u64 orig_start;       /* logical start offset, in blocks, in the orig file */
+        __u64 donor_start;      /* logical start offset, in blocks, in the donor file */
+        __u64 len;              /* number of blocks to be moved */
+        __u64 moved_len;        /* number of blocks moved; presumably an output - confirm */
+};
+#define MAX_DEFRAG_SIZE         ((1UL<<31) - 1)
 #define EXT4_EPOCH_BITS 2
 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
@@ -585,10 +595,6 @@ struct ext4_inode_info {
         */
        struct rw_semaphore xattr_sem;
 #endif
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
-        struct posix_acl        *i_acl;
-        struct posix_acl        *i_default_acl;
-#endif
        struct list_head i_orphan;      /* unlinked but open inodes */
@@ -674,7 +680,6 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_ERRORS_PANIC         0x00040 /* Panic on errors */
 #define EXT4_MOUNT_MINIX_DF             0x00080 /* Mimics the Minix statfs */
 #define EXT4_MOUNT_NOLOAD               0x00100 /* Don't use existing journal*/
-#define EXT4_MOUNT_ABORT                0x00200 /* Fatal error detected */
 #define EXT4_MOUNT_DATA_FLAGS           0x00C00 /* Mode for data writes: */
 #define EXT4_MOUNT_JOURNAL_DATA         0x00400 /* Write data to journal */
 #define EXT4_MOUNT_ORDERED_DATA         0x00800 /* Flush data before commit */
@@ -696,17 +701,10 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DATA_ERR_ABORT       0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY       0x20000000 /* Block validity checking */
-/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
-#ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)               o &= ~EXT4_MOUNT_##opt
 #define set_opt(o, opt)                 o |= EXT4_MOUNT_##opt
 #define test_opt(sb, opt)               (EXT4_SB(sb)->s_mount_opt & \
                                         EXT4_MOUNT_##opt)
-#else
-#define EXT2_MOUNT_NOLOAD               EXT4_MOUNT_NOLOAD
-#define EXT2_MOUNT_ABORT                EXT4_MOUNT_ABORT
-#define EXT2_MOUNT_DATA_FLAGS           EXT4_MOUNT_DATA_FLAGS
-#endif
 #define ext4_set_bit                    ext2_set_bit
 #define ext4_set_bit_atomic             ext2_set_bit_atomic
@@ -824,6 +822,13 @@ struct ext4_super_block {
 };
 #ifdef __KERNEL__
+/*
+ * run-time mount flags
+ */
+#define EXT4_MF_MNTDIR_SAMPLED  0x0001
+#define EXT4_MF_FS_ABORTED      0x0002  /* Fatal error detected */
 /*
 * fourth extended-fs super-block data in memory
 */
@@ -842,7 +847,8 @@ struct ext4_sb_info {
        struct buffer_head * s_sbh;     /* Buffer containing the super block */
        struct ext4_super_block *s_es;  /* Pointer to the super block in the buffer */
        struct buffer_head **s_group_desc;
-        unsigned long  s_mount_opt;
+        unsigned int s_mount_opt;
+        unsigned int s_mount_flags;
        ext4_fsblk_t s_sb_block;
        uid_t s_resuid;
        gid_t s_resgid;
@@ -853,6 +859,7 @@ struct ext4_sb_info {
        int s_inode_size;
        int s_first_ino;
        unsigned int s_inode_readahead_blks;
+        unsigned int s_inode_goal;
        spinlock_t s_next_gen_lock;
        u32 s_next_generation;
        u32 s_hash_seed[4];
@@ -1305,7 +1312,8 @@ extern int ext4fs_dirhash(const char *name, int len, struct
                          dx_hash_info *hinfo);
 /* ialloc.c */
-extern struct inode * ext4_new_inode(handle_t *, struct inode *, int);
+extern struct inode *ext4_new_inode(handle_t *, struct inode *, int,
+                                    const struct qstr *qstr, __u32 goal);
 extern void ext4_free_inode(handle_t *, struct inode *);
 extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
@@ -1329,7 +1337,7 @@ extern void ext4_discard_preallocations(struct inode *);
 extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
-                unsigned long, unsigned long, int, unsigned long *);
+                ext4_fsblk_t, unsigned long, int, unsigned long *);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
                ext4_group_t i, struct ext4_group_desc *desc);
 extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
@@ -1647,6 +1655,11 @@ extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
                           struct buffer_head *bh, int flags);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        __u64 start, __u64 len);
+/* move_extent.c */
+extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
+                             __u64 start_orig, __u64 start_donor,
+                             __u64 len, __u64 *moved_len);
 /*
 * Add new method to test wether block and inode bitmaps are properly
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index f0c3ec85bd48..20a84105a10b 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -221,12 +221,16 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
 }
 extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
+extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
 extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
 extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
                                                   int num,
                                                   struct ext4_ext_path *path);
+extern int ext4_can_extents_be_merged(struct inode *inode,
+                                      struct ext4_extent *ex1,
+                                      struct ext4_extent *ex2);
 extern int ext4_ext_try_to_merge(struct inode *inode,
                                 struct ext4_ext_path *path,
                                 struct ext4_extent *);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index ad13a84644e1..eb27fd0f2ee8 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -43,6 +43,8 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
                        ext4_journal_abort_handle(where, __func__, bh,
                                                  handle, err);
        }
+        else
+                brelse(bh);
        return err;
 }
@@ -57,6 +59,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
                        ext4_journal_abort_handle(where, __func__, bh,
                                                  handle, err);
        }
+        else
+                brelse(bh);
        return err;
 }
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index be2f426f6805..139fb8cb87e4 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -131,9 +131,11 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
 int __ext4_journal_get_write_access(const char *where, handle_t *handle,
                                struct buffer_head *bh);
+/* When called with an invalid handle, this will still do a put on the BH */
 int __ext4_journal_forget(const char *where, handle_t *handle,
                                struct buffer_head *bh);
+/* When called with an invalid handle, this will still do a put on the BH */
 int __ext4_journal_revoke(const char *where, handle_t *handle,
                                ext4_fsblk_t blocknr, struct buffer_head *bh);
@@ -281,10 +283,10 @@ static inline int ext4_should_order_data(struct inode *inode)
 static inline int ext4_should_writeback_data(struct inode *inode)
 {
-        if (EXT4_JOURNAL(inode) == NULL)
-                return 0;
        if (!S_ISREG(inode->i_mode))
                return 0;
+        if (EXT4_JOURNAL(inode) == NULL)
+                return 1;
        if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
                return 0;
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 2593f748c3a4..73ebfb44ad75 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -49,7 +49,7 @@
 * ext_pblock:
 * combine low and high parts of physical block number into ext4_fsblk_t
 */
-static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
+ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
 {
        ext4_fsblk_t block;
@@ -1417,7 +1417,7 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
        return err;
 }
-static int
+int
 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
                                struct ext4_extent *ex2)
 {
@@ -1977,6 +1977,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
                         */
                        /* 1 bitmap, 1 block group descriptor */
                        ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
+                        return ret;
                }
        }
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 588af8c77246..3f1873fef1c6 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -21,6 +21,8 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/jbd2.h>
+#include <linux/mount.h>
+#include <linux/path.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -145,6 +147,38 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
        return 0;
 }
+/*
+ * ext4_file_open() - open method for regular ext4 files.
+ *
+ * The first time a file is opened while the filesystem is not mounted
+ * read-only, the mount point path is sampled into the superblock field
+ * s_last_mounted and the superblock is marked dirty.
+ * EXT4_MF_MNTDIR_SAMPLED is set first so the sampling is attempted only
+ * once, even if d_path() fails. The open itself is delegated to
+ * generic_file_open(), whose return value is passed back unchanged.
+ */
+static int ext4_file_open(struct inode *inode, struct file *filp)
+{
+        struct super_block *sb = inode->i_sb;
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct vfsmount *mnt = filp->f_path.mnt;
+        struct path path;
+        char buf[64], *cp;
+        if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
+                     !(sb->s_flags & MS_RDONLY))) {
+                sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
+                /*
+                 * Sample where the filesystem has been mounted and
+                 * store it in the superblock for sysadmin convenience
+                 * when trying to sort through large numbers of block
+                 * devices or filesystem images.
+                 */
+                memset(buf, 0, sizeof(buf));
+                path.mnt = mnt->mnt_parent;
+                path.dentry = mnt->mnt_mountpoint;
+                path_get(&path);
+                cp = d_path(&path, buf, sizeof(buf));
+                path_put(&path);
+                if (!IS_ERR(cp)) {
+                        /*
+                         * d_path() returns a pointer somewhere inside buf,
+                         * not necessarily its start, so copying a fixed
+                         * sizeof(s_last_mounted) bytes from cp could read
+                         * past the end of buf. Copy only the string;
+                         * strncpy() stops at the NUL and zero-fills the
+                         * remainder of the fixed-width field.
+                         */
+                        strncpy(sbi->s_es->s_last_mounted, cp,
+                                sizeof(sbi->s_es->s_last_mounted));
+                        sb->s_dirt = 1;
+                }
+        }
+        return generic_file_open(inode, filp);
+}
 const struct file_operations ext4_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
@@ -156,7 +190,7 @@ const struct file_operations ext4_file_operations = {
        .compat_ioctl   = ext4_compat_ioctl,
 #endif
        .mmap           = ext4_file_mmap,
-        .open           = generic_file_open,
+        .open           = ext4_file_open,
        .release        = ext4_release_file,
        .fsync          = ext4_sync_file,
        .splice_read    = generic_file_splice_read,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 5afe4370840b..83cf6415f599 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,10 +28,12 @@
 #include <linux/writeback.h>
 #include <linux/jbd2.h>
 #include <linux/blkdev.h>
-#include <linux/marker.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
+#include <trace/events/ext4.h>
 /*
 * akpm: A new design for ext4_sync_file().
 *
@@ -52,9 +54,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
        J_ASSERT(ext4_journal_current_handle() == NULL);
-        trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld",
+        trace_ext4_sync_file(file, dentry, datasync);
-                   inode->i_sb->s_id, datasync, inode->i_ino,
-                   dentry->d_parent->d_inode->i_ino);
        /*
         * data=writeback:
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3743bd849bce..29e6dc7299b8 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -23,11 +23,14 @@
 #include <linux/bitops.h>
 #include <linux/blkdev.h>
 #include <asm/byteorder.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
+#include <trace/events/ext4.h>
 /*
 * ialloc.c contains the inodes allocation and deallocation routines
 */
@@ -208,11 +211,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
        ino = inode->i_ino;
        ext4_debug("freeing inode %lu\n", ino);
-        trace_mark(ext4_free_inode,
+        trace_ext4_free_inode(inode);
-                   "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu",
-                   sb->s_id, inode->i_ino, inode->i_mode,
-                   (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
-                   (unsigned long long) inode->i_blocks);
        /*
         * Note: we must free any quota before locking the superblock,
@@ -471,7 +470,8 @@ void get_orlov_stats(struct super_block *sb, ext4_group_t g,
 */
 static int find_group_orlov(struct super_block *sb, struct inode *parent,
-                            ext4_group_t *group, int mode)
+                            ext4_group_t *group, int mode,
+                            const struct qstr *qstr)
 {
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -486,6 +486,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
        struct ext4_group_desc *desc;
        struct orlov_stats stats;
        int flex_size = ext4_flex_bg_size(sbi);
+        struct dx_hash_info hinfo;
        ngroups = real_ngroups;
        if (flex_size > 1) {
@@ -507,7 +508,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
                int best_ndir = inodes_per_group;
                int ret = -1;
-                get_random_bytes(&grp, sizeof(grp));
+                if (qstr) {
+                        hinfo.hash_version = DX_HASH_HALF_MD4;
+                        hinfo.seed = sbi->s_hash_seed;
+                        ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
+                        grp = hinfo.hash;
+                } else
+                        get_random_bytes(&grp, sizeof(grp));
                parent_group = (unsigned)grp % ngroups;
                for (i = 0; i < ngroups; i++) {
                        g = (parent_group + i) % ngroups;
@@ -650,7 +657,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
                *group = parent_group + flex_size;
                if (*group > ngroups)
                        *group = 0;
-                return find_group_orlov(sb, parent, group, mode);
+                return find_group_orlov(sb, parent, group, mode, 0);
        }
        /*
@@ -791,7 +798,8 @@ err_ret:
 * For other inodes, search forward from the parent directory's block
 * group to find a free inode.
 */
-struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
+struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
+                             const struct qstr *qstr, __u32 goal)
 {
        struct super_block *sb;
        struct buffer_head *inode_bitmap_bh = NULL;
@@ -815,14 +823,23 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
        sb = dir->i_sb;
        ngroups = ext4_get_groups_count(sb);
-        trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
+        trace_ext4_request_inode(dir, mode);
-                   dir->i_ino, mode);
        inode = new_inode(sb);
        if (!inode)
                return ERR_PTR(-ENOMEM);
        ei = EXT4_I(inode);
        sbi = EXT4_SB(sb);
+        if (!goal)
+                goal = sbi->s_inode_goal;
+        if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
+                group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
+                ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
+                ret2 = 0;
+                goto got_group;
+        }
        if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
                ret2 = find_group_flex(sb, dir, &group);
                if (ret2 == -1) {
@@ -841,7 +858,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
                if (test_opt(sb, OLDALLOC))
                        ret2 = find_group_dir(sb, dir, &group);
                else
-                        ret2 = find_group_orlov(sb, dir, &group, mode);
+                        ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
        } else
                ret2 = find_group_other(sb, dir, &group, mode);
@@ -851,7 +868,7 @@ got_group:
        if (ret2 == -1)
                goto out;
-        for (i = 0; i < ngroups; i++) {
+        for (i = 0; i < ngroups; i++, ino = 0) {
                err = -EIO;
                gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
@@ -863,8 +880,6 @@ got_group:
                if (!inode_bitmap_bh)
                        goto fail;
-                ino = 0;
 repeat_in_this_group:
                ino = ext4_find_next_zero_bit((unsigned long *)
                                              inode_bitmap_bh->b_data,
@@ -1047,8 +1062,7 @@ got:
        }
        ext4_debug("allocating inode %lu\n", inode->i_ino);
-        trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d",
+        trace_ext4_allocate_inode(inode, dir, mode);
-                   sb->s_id, inode->i_ino, dir->i_ino, mode);
        goto really_out;
 fail:
        ext4_std_error(sb, err);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 875db944b22f..f9c642b22efa 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,11 +37,14 @@
 #include <linux/namei.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
 #include "ext4_extents.h"
+#include <trace/events/ext4.h>
 #define MPAGE_DA_EXTENT_TAIL 0x01
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
@@ -75,22 +78,20 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
 * but there may still be a record of it in the journal, and that record
 * still needs to be revoked.
 *
- * If the handle isn't valid we're not journaling so there's nothing to do.
+ * If the handle isn't valid we're not journaling, but we still need to
+ * call into ext4_journal_revoke() to put the buffer head.
 */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
-                        struct buffer_head *bh, ext4_fsblk_t blocknr)
+                struct buffer_head *bh, ext4_fsblk_t blocknr)
 {
        int err;
-        if (!ext4_handle_valid(handle))
-                return 0;
        might_sleep();
        BUFFER_TRACE(bh, "enter");
        jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
-                  "data mode %lx\n",
+                  "data mode %x\n",
                  bh, is_metadata, inode->i_mode,
                  test_opt(inode->i_sb, DATA_FLAGS));
@@ -329,8 +330,8 @@ static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
 */
 static int ext4_block_to_path(struct inode *inode,
-                        ext4_lblk_t i_block,
+                              ext4_lblk_t i_block,
-                        ext4_lblk_t offsets[4], int *boundary)
+                              ext4_lblk_t offsets[4], int *boundary)
 {
        int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
        int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
@@ -362,9 +363,9 @@ static int ext4_block_to_path(struct inode *inode,
                final = ptrs;
        } else {
                ext4_warning(inode->i_sb, "ext4_block_to_path",
-                                "block %lu > max in inode %lu",
+                             "block %lu > max in inode %lu",
-                                i_block + direct_blocks +
+                             i_block + direct_blocks +
-                                indirect_blocks + double_blocks, inode->i_ino);
+                             indirect_blocks + double_blocks, inode->i_ino);
        }
        if (boundary)
                *boundary = final - 1 - (i_block & (ptrs - 1));
@@ -379,25 +380,25 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
        while (bref < p+max) {
                blk = le32_to_cpu(*bref++);
-                if (blk && 
+                if (blk &&
-                    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 
+                    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
                                                    blk, 1))) {
                        ext4_error(inode->i_sb, function,
                                   "invalid block reference %u "
                                   "in inode #%lu", blk, inode->i_ino);
-                        return -EIO;
+                        return -EIO;
-                }
+                }
-        }
+        }
-        return 0;
+        return 0;
 }
 #define ext4_check_indirect_blockref(inode, bh)                         \
-        __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data,  \
+        __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data,  \
                              EXT4_ADDR_PER_BLOCK((inode)->i_sb))
 #define ext4_check_inode_blockref(inode)                                \
-        __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data,   \
+        __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data,   \
                              EXT4_NDIR_BLOCKS)
 /**
@@ -447,7 +448,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
                bh = sb_getblk(sb, le32_to_cpu(p->key));
                if (unlikely(!bh))
                        goto failure;
-                  
                if (!bh_uptodate_or_lock(bh)) {
                        if (bh_submit_read(bh) < 0) {
                                put_bh(bh);
@@ -459,7 +460,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
                                goto failure;
                        }
                }
-                
                add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
                /* Reader: end */
                if (!p->key)
@@ -552,7 +553,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 *      returns it.
 */
 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
-                Indirect *partial)
+                                   Indirect *partial)
 {
        /*
         * XXX need to get goal block from mballoc's data structures
@@ -574,7 +575,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
 *      direct and indirect blocks.
 */
 static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
-                int blocks_to_boundary)
+                                 int blocks_to_boundary)
 {
        unsigned int count = 0;
@@ -610,9 +611,9 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
 *              direct blocks
 */
 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
-                                ext4_lblk_t iblock, ext4_fsblk_t goal,
+                             ext4_lblk_t iblock, ext4_fsblk_t goal,
-                                int indirect_blks, int blks,
+                             int indirect_blks, int blks,
-                                ext4_fsblk_t new_blocks[4], int *err)
+                             ext4_fsblk_t new_blocks[4], int *err)
 {
        struct ext4_allocation_request ar;
        int target, i;
@@ -683,10 +684,10 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
        }
        if (!*err) {
                if (target == blks) {
-                /*
+                        /*
-                 * save the new block number
+                         * save the new block number
-                 * for the first direct block
+                         * for the first direct block
-                 */
+                         */
                        new_blocks[index] = current_block;
                }
                blk_allocated += ar.len;
@@ -728,9 +729,9 @@ failed_out:
 *      as described above and return 0.
 */
 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
-                                ext4_lblk_t iblock, int indirect_blks,
+                             ext4_lblk_t iblock, int indirect_blks,
-                                int *blks, ext4_fsblk_t goal,
+                             int *blks, ext4_fsblk_t goal,
-                                ext4_lblk_t *offsets, Indirect *branch)
+                             ext4_lblk_t *offsets, Indirect *branch)
 {
        int blocksize = inode->i_sb->s_blocksize;
        int i, n = 0;
@@ -777,7 +778,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
                         * the chain to point to the new allocated
                         * data blocks numbers
                         */
-                        for (i=1; i < num; i++)
+                        for (i = 1; i < num; i++)
                                *(branch[n].p + i) = cpu_to_le32(++current_block);
                }
                BUFFER_TRACE(bh, "marking uptodate");
@@ -820,7 +821,8 @@ failed:
 * chain to new block and return 0.
 */
 static int ext4_splice_branch(handle_t *handle, struct inode *inode,
-                        ext4_lblk_t block, Indirect *where, int num, int blks)
+                              ext4_lblk_t block, Indirect *where, int num,
+                              int blks)
 {
        int i;
        int err = 0;
@@ -852,10 +854,6 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
        }
        /* We are done with atomic stuff, now do the rest of housekeeping */
-        inode->i_ctime = ext4_current_time(inode);
-        ext4_mark_inode_dirty(handle, inode);
        /* had we spliced it onto indirect block? */
        if (where->bh) {
                /*
@@ -874,8 +872,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
        } else {
                /*
                 * OK, we spliced it into the inode itself on a direct block.
-                 * Inode was dirtied above.
                 */
+                ext4_mark_inode_dirty(handle, inode);
                jbd_debug(5, "splicing direct\n");
        }
        return err;
@@ -921,9 +919,9 @@ err_out:
 * blocks.
 */
 static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
-                                  ext4_lblk_t iblock, unsigned int maxblocks,
+                               ext4_lblk_t iblock, unsigned int maxblocks,
-                                  struct buffer_head *bh_result,
+                               struct buffer_head *bh_result,
-                                  int flags)
+                               int flags)
 {
        int err = -EIO;
        ext4_lblk_t offsets[4];
@@ -939,7 +937,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
        J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
        J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
        depth = ext4_block_to_path(inode, iblock, offsets,
-                                        &blocks_to_boundary);
+                                   &blocks_to_boundary);
        if (depth == 0)
                goto out;
@@ -987,8 +985,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
         * Block out ext4_truncate while we alter the tree
         */
        err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
-                                        &count, goal,
+                                &count, goal,
-                                        offsets + (partial - chain), partial);
+                                offsets + (partial - chain), partial);
        /*
         * The ext4_splice_branch call will free and forget any buffers
@@ -999,8 +997,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
         */
        if (!err)
                err = ext4_splice_branch(handle, inode, iblock,
-                                        partial, indirect_blks, count);
+                                         partial, indirect_blks, count);
-        else 
+        else
                goto cleanup;
        set_buffer_new(bh_result);
@@ -1172,7 +1170,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
        up_read((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && buffer_mapped(bh)) {
-                int ret = check_block_validity(inode, block, 
+                int ret = check_block_validity(inode, block,
                                               bh->b_blocknr, retval);
                if (ret != 0)
                        return ret;
@@ -1254,7 +1252,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && buffer_mapped(bh)) {
-                int ret = check_block_validity(inode, block, 
+                int ret = check_block_validity(inode, block,
                                               bh->b_blocknr, retval);
                if (ret != 0)
                        return ret;
@@ -1405,8 +1403,7 @@ static int walk_page_buffers(handle_t *handle,
        for (bh = head, block_start = 0;
             ret == 0 && (bh != head || !block_start);
-             block_start = block_end, bh = next)
+             block_start = block_end, bh = next) {
-        {
                next = bh->b_this_page;
                block_end = block_start + blocksize;
                if (block_end <= from || block_start >= to) {
@@ -1447,7 +1444,7 @@ static int walk_page_buffers(handle_t *handle,
 * write.
 */
 static int do_journal_get_write_access(handle_t *handle,
-                                        struct buffer_head *bh)
+                                       struct buffer_head *bh)
 {
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
@@ -1455,27 +1452,24 @@ static int do_journal_get_write_access(handle_t *handle,
 }
 static int ext4_write_begin(struct file *file, struct address_space *mapping,
-                                loff_t pos, unsigned len, unsigned flags,
+                            loff_t pos, unsigned len, unsigned flags,
-                                struct page **pagep, void **fsdata)
+                            struct page **pagep, void **fsdata)
 {
        struct inode *inode = mapping->host;
        int ret, needed_blocks;
        handle_t *handle;
        int retries = 0;
        struct page *page;
-        pgoff_t index;
+        pgoff_t index;
        unsigned from, to;
-        trace_mark(ext4_write_begin,
+        trace_ext4_write_begin(inode, pos, len, flags);
-                   "dev %s ino %lu pos %llu len %u flags %u",
-                   inode->i_sb->s_id, inode->i_ino,
-                   (unsigned long long) pos, len, flags);
        /*
         * Reserve one block more for addition to orphan list in case
         * we allocate blocks but write fails for some reason
         */
        needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
-        index = pos >> PAGE_CACHE_SHIFT;
+        index = pos >> PAGE_CACHE_SHIFT;
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;
@@ -1517,14 +1511,14 @@ retry:
                 * Add inode to orphan list in case we crash before
                 * truncate finishes
                 */
-                if (pos + len > inode->i_size)
+                if (pos + len > inode->i_size && ext4_can_truncate(inode))
                        ext4_orphan_add(handle, inode);
                ext4_journal_stop(handle);
                if (pos + len > inode->i_size) {
-                        vmtruncate(inode, inode->i_size);
+                        ext4_truncate(inode);
-                        /* 
+                        /*
-                         * If vmtruncate failed early the inode might
+                         * If truncate failed early the inode might
                         * still be on the orphan list; we need to
                         * make sure the inode is removed from the
                         * orphan list in that case.
@@ -1550,9 +1544,9 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 }
 static int ext4_generic_write_end(struct file *file,
-                                struct address_space *mapping,
+                                  struct address_space *mapping,
-                                loff_t pos, unsigned len, unsigned copied,
+                                  loff_t pos, unsigned len, unsigned copied,
-                                struct page *page, void *fsdata)
+                                  struct page *page, void *fsdata)
 {
        int i_size_changed = 0;
        struct inode *inode = mapping->host;
@@ -1603,25 +1597,22 @@ static int ext4_generic_write_end(struct file *file,
 * buffers are managed internally.
 */
 static int ext4_ordered_write_end(struct file *file,
-                                struct address_space *mapping,
+                                  struct address_space *mapping,
-                                loff_t pos, unsigned len, unsigned copied,
+                                  loff_t pos, unsigned len, unsigned copied,
-                                struct page *page, void *fsdata)
+                                  struct page *page, void *fsdata)
 {
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
        int ret = 0, ret2;
-        trace_mark(ext4_ordered_write_end,
+        trace_ext4_ordered_write_end(inode, pos, len, copied);
-                   "dev %s ino %lu pos %llu len %u copied %u",
-                   inode->i_sb->s_id, inode->i_ino,
-                   (unsigned long long) pos, len, copied);
        ret = ext4_jbd2_file_inode(handle, inode);
        if (ret == 0) {
                ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
                copied = ret2;
-                if (pos + len > inode->i_size)
+                if (pos + len > inode->i_size && ext4_can_truncate(inode))
                        /* if we have allocated more blocks and copied
                         * less. We will have blocks allocated outside
                         * inode->i_size. So truncate them
@@ -1635,9 +1626,9 @@ static int ext4_ordered_write_end(struct file *file,
                ret = ret2;
        if (pos + len > inode->i_size) {
-                vmtruncate(inode, inode->i_size);
+                ext4_truncate(inode);
-                /* 
+                /*
-                 * If vmtruncate failed early the inode might still be
+                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
@@ -1650,22 +1641,19 @@ static int ext4_ordered_write_end(struct file *file,
 }
 static int ext4_writeback_write_end(struct file *file,
-                                struct address_space *mapping,
+                                    struct address_space *mapping,
-                                loff_t pos, unsigned len, unsigned copied,
+                                    loff_t pos, unsigned len, unsigned copied,
-                                struct page *page, void *fsdata)
+                                    struct page *page, void *fsdata)
 {
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
        int ret = 0, ret2;
-        trace_mark(ext4_writeback_write_end,
+        trace_ext4_writeback_write_end(inode, pos, len, copied);
-                   "dev %s ino %lu pos %llu len %u copied %u",
-                   inode->i_sb->s_id, inode->i_ino,
-                   (unsigned long long) pos, len, copied);
        ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
        copied = ret2;
-        if (pos + len > inode->i_size)
+        if (pos + len > inode->i_size && ext4_can_truncate(inode))
                /* if we have allocated more blocks and copied
                 * less. We will have blocks allocated outside
                 * inode->i_size. So truncate them
@@ -1680,9 +1668,9 @@ static int ext4_writeback_write_end(struct file *file,
                ret = ret2;
        if (pos + len > inode->i_size) {
-                vmtruncate(inode, inode->i_size);
+                ext4_truncate(inode);
-                /* 
+                /*
-                 * If vmtruncate failed early the inode might still be
+                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
@@ -1694,9 +1682,9 @@ static int ext4_writeback_write_end(struct file *file,
 }
 static int ext4_journalled_write_end(struct file *file,
-                                struct address_space *mapping,
+                                     struct address_space *mapping,
-                                loff_t pos, unsigned len, unsigned copied,
+                                     loff_t pos, unsigned len, unsigned copied,
-                                struct page *page, void *fsdata)
+                                     struct page *page, void *fsdata)
 {
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
@@ -1705,10 +1693,7 @@ static int ext4_journalled_write_end(struct file *file,
        unsigned from, to;
        loff_t new_i_size;
-        trace_mark(ext4_journalled_write_end,
+        trace_ext4_journalled_write_end(inode, pos, len, copied);
-                   "dev %s ino %lu pos %llu len %u copied %u",
-                   inode->i_sb->s_id, inode->i_ino,
-                   (unsigned long long) pos, len, copied);
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;
@@ -1735,7 +1720,7 @@ static int ext4_journalled_write_end(struct file *file,
        unlock_page(page);
        page_cache_release(page);
-        if (pos + len > inode->i_size)
+        if (pos + len > inode->i_size && ext4_can_truncate(inode))
                /* if we have allocated more blocks and copied
                 * less. We will have blocks allocated outside
                 * inode->i_size. So truncate them
@@ -1746,9 +1731,9 @@ static int ext4_journalled_write_end(struct file *file,
        if (!ret)
                ret = ret2;
        if (pos + len > inode->i_size) {
-                vmtruncate(inode, inode->i_size);
+                ext4_truncate(inode);
-                /* 
+                /*
-                 * If vmtruncate failed early the inode might still be
+                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
@@ -1854,7 +1839,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 }
 static void ext4_da_page_release_reservation(struct page *page,
-                                                unsigned long offset)
+                                             unsigned long offset)
 {
        int to_release = 0;
        struct buffer_head *head, *bh;
@@ -2318,15 +2303,9 @@ flush_it:
        return;
 }
-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 {
-        /*
+        return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
-         * unmapped buffer is possible for holes.
-         * delay buffer is possible with delayed allocation.
-         * We also need to consider unwritten buffer as unmapped.
-         */
-        return (!buffer_mapped(bh) || buffer_delay(bh) ||
-                                buffer_unwritten(bh)) && buffer_dirty(bh);
 }
 /*
@@ -2411,9 +2390,9 @@ static int __mpage_da_writepage(struct page *page,
                         * We need to try to allocate
                         * unmapped blocks in the same page.
                         * Otherwise we won't make progress
-                         * with the page in ext4_da_writepage
+                         * with the page in ext4_writepage
                         */
-                        if (ext4_bh_unmapped_or_delay(NULL, bh)) {
+                        if (ext4_bh_delay_or_unwritten(NULL, bh)) {
                                mpage_add_bh_to_extent(mpd, logical,
                                                       bh->b_size,
                                                       bh->b_state);
@@ -2530,7 +2509,6 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
         * so call get_block_wrap with create = 0
         */
        ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
-        BUG_ON(create && ret == 0);
        if (ret > 0) {
                bh_result->b_size = (ret << inode->i_blkbits);
                ret = 0;
@@ -2538,15 +2516,102 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
        return ret;
 }
+static int bget_one(handle_t *handle, struct buffer_head *bh)
+{
+        get_bh(bh);
+        return 0;
+}
+static int bput_one(handle_t *handle, struct buffer_head *bh)
+{
+        put_bh(bh);
+        return 0;
+}
+static int __ext4_journalled_writepage(struct page *page,
+                                       struct writeback_control *wbc,
+                                       unsigned int len)
+{
+        struct address_space *mapping = page->mapping;
+        struct inode *inode = mapping->host;
+        struct buffer_head *page_bufs;
+        handle_t *handle = NULL;
+        int ret = 0;
+        int err;
+        page_bufs = page_buffers(page);
+        BUG_ON(!page_bufs);
+        walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
+        /* As soon as we unlock the page, it can go away, but we have
+         * references to buffers so we are safe */
+        unlock_page(page);
+        handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                goto out;
+        }
+        ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
+                                do_journal_get_write_access);
+        err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
+                                write_end_fn);
+        if (ret == 0)
+                ret = err;
+        err = ext4_journal_stop(handle);
+        if (!ret)
+                ret = err;
+        walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
+        EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+out:
+        return ret;
+}
 /*
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We even don't
+ * need to file the inode to the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), no one guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start transaction directly because transaction start ranks above page
+ * lock so we have to do some magic.
+ *
 * This function can get called via...
 *   - ext4_da_writepages after taking page lock (have journal handle)
 *   - journal_submit_inode_data_buffers (no journal handle)
 *   - shrink_page_list via pdflush (no journal handle)
 *   - grab_page_cache when doing write_begin (have journal handle)
+ *
+ * We don't do any block allocation in this function. If we have page with
+ * multiple blocks we need to write those buffer_heads that are mapped. This
+ * is important for mmap-based writes. So if, with a 1K blocksize, we do
+ * truncate(f, 1024);
+ * a = mmap(f, 0, 4096);
+ * a[0] = 'a';
+ * truncate(f, 4096);
+ * the first buffer_head in the page is mapped via the page_mkwrite callback,
+ * but the other buffer_heads are unmapped yet dirty (dirtied via
+ * do_wp_page). So writepage should write the first block. If we modify
+ * the mmap area beyond 1024 we will again get a page_fault and the
+ * page_mkwrite callback will do the block allocation and mark the
+ * buffer_heads mapped.
+ *
+ * We redirty the page if any buffer_head in it is either delay or
+ * unwritten.
+ *
+ * We can get recursively called as shown below.
+ *
+ *      ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
+ *              ext4_writepage()
+ *
+ * But since we don't do any block allocation we should not deadlock.
+ * The page's dirty flag is also cleared, so we don't get a recursive page_lock.
 */
-static int ext4_da_writepage(struct page *page,
+static int ext4_writepage(struct page *page,
-                                struct writeback_control *wbc)
+                          struct writeback_control *wbc)
 {
        int ret = 0;
        loff_t size;
@@ -2554,9 +2619,7 @@ static int ext4_da_writepage(struct page *page,
        struct buffer_head *page_bufs;
        struct inode *inode = page->mapping->host;
-        trace_mark(ext4_da_writepage,
+        trace_ext4_writepage(inode, page);
-                   "dev %s ino %lu page_index %lu",
-                   inode->i_sb->s_id, inode->i_ino, page->index);
        size = i_size_read(inode);
        if (page->index == size >> PAGE_CACHE_SHIFT)
                len = size & ~PAGE_CACHE_MASK;
@@ -2566,7 +2629,7 @@ static int ext4_da_writepage(struct page *page,
        if (page_has_buffers(page)) {
                page_bufs = page_buffers(page);
                if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                                        ext4_bh_unmapped_or_delay)) {
+                                        ext4_bh_delay_or_unwritten)) {
                        /*
                         * We don't want to do  block allocation
                         * So redirty the page and return
@@ -2593,13 +2656,13 @@ static int ext4_da_writepage(struct page *page,
                 * all are mapped and non delay. We don't want to
                 * do block allocation here.
                 */
-                ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+                ret = block_prepare_write(page, 0, len,
                                          noalloc_get_block_write);
                if (!ret) {
                        page_bufs = page_buffers(page);
                        /* check whether all are mapped and non delay */
                        if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                                                ext4_bh_unmapped_or_delay)) {
+                                                ext4_bh_delay_or_unwritten)) {
                                redirty_page_for_writepage(wbc, page);
                                unlock_page(page);
                                return 0;
@@ -2615,7 +2678,16 @@ static int ext4_da_writepage(struct page *page,
                        return 0;
                }
                /* now mark the buffer_heads as dirty and uptodate */
-                block_commit_write(page, 0, PAGE_CACHE_SIZE);
+                block_commit_write(page, 0, len);
+        }
+        if (PageChecked(page) && ext4_should_journal_data(inode)) {
+                /*
+                 * It's mmapped pagecache.  Add buffers and journal it.  There
+                 * doesn't seem much point in redirtying the page here.
+                 */
+                ClearPageChecked(page);
+                return __ext4_journalled_writepage(page, wbc, len);
        }
        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
@@ -2667,19 +2739,7 @@ static int ext4_da_writepages(struct address_space *mapping,
        int needed_blocks, ret = 0, nr_to_writebump = 0;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
-        trace_mark(ext4_da_writepages,
+        trace_ext4_da_writepages(inode, wbc);
-                   "dev %s ino %lu nr_t_write %ld "
-                   "pages_skipped %ld range_start %llu "
-                   "range_end %llu nonblocking %d "
-                   "for_kupdate %d for_reclaim %d "
-                   "for_writepages %d range_cyclic %d",
-                   inode->i_sb->s_id, inode->i_ino,
-                   wbc->nr_to_write, wbc->pages_skipped,
-                   (unsigned long long) wbc->range_start,
-                   (unsigned long long) wbc->range_end,
-                   wbc->nonblocking, wbc->for_kupdate,
-                   wbc->for_reclaim, wbc->for_writepages,
-                   wbc->range_cyclic);
        /*
         * No pages to write? This is mainly a kludge to avoid starting
@@ -2693,13 +2753,13 @@ static int ext4_da_writepages(struct address_space *mapping,
         * If the filesystem has aborted, it is read-only, so return
         * right away instead of dumping stack traces later on that
         * will obscure the real source of the problem.  We test
-         * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
+         * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
         * the latter could be true if the filesystem is mounted
         * read-only, and in that case, ext4_da_writepages should
         * *never* be called, so if that ever happens, we would want
         * the stack trace.
         */
-        if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
+        if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
                return -EROFS;
        /*
@@ -2845,14 +2905,7 @@ out_writepages:
        if (!no_nrwrite_index_update)
                wbc->no_nrwrite_index_update = 0;
        wbc->nr_to_write -= nr_to_writebump;
-        trace_mark(ext4_da_writepage_result,
+        trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
-                   "dev %s ino %lu ret %d pages_written %d "
-                   "pages_skipped %ld congestion %d "
-                   "more_io %d no_nrwrite_index_update %d",
-                   inode->i_sb->s_id, inode->i_ino, ret,
-                   pages_written, wbc->pages_skipped,
-                   wbc->encountered_congestion, wbc->more_io,
-                   wbc->no_nrwrite_index_update);
        return ret;
 }
@@ -2884,8 +2937,8 @@ static int ext4_nonda_switch(struct super_block *sb)
 }
 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
-                                loff_t pos, unsigned len, unsigned flags,
+                               loff_t pos, unsigned len, unsigned flags,
-                                struct page **pagep, void **fsdata)
+                               struct page **pagep, void **fsdata)
 {
        int ret, retries = 0;
        struct page *page;
@@ -2904,11 +2957,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
                                        len, flags, pagep, fsdata);
        }
        *fsdata = (void *)0;
+        trace_ext4_da_write_begin(inode, pos, len, flags);
-        trace_mark(ext4_da_write_begin,
-                   "dev %s ino %lu pos %llu len %u flags %u",
-                   inode->i_sb->s_id, inode->i_ino,
-                   (unsigned long long) pos, len, flags);
 retry:
        /*
         * With delayed allocation, we don't log the i_disksize update
@@ -2945,7 +2994,7 @@ retry:
                 * i_size_read because we hold i_mutex.
                 */
                if (pos + len > inode->i_size)
-                        vmtruncate(inode, inode->i_size);
+                        ext4_truncate(inode);
        }
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -2959,7 +3008,7 @@ out:
 * when write to the end of file but not require block allocation
 */
 static int ext4_da_should_update_i_disksize(struct page *page,
-                                         unsigned long offset)
+                                            unsigned long offset)
 {
        struct buffer_head *bh;
        struct inode *inode = page->mapping->host;
@@ -2978,9 +3027,9 @@ static int ext4_da_should_update_i_disksize(struct page *page,
 }
 static int ext4_da_write_end(struct file *file,
-                                struct address_space *mapping,
+                             struct address_space *mapping,
-                                loff_t pos, unsigned len, unsigned copied,
+                             loff_t pos, unsigned len, unsigned copied,
-                                struct page *page, void *fsdata)
+                             struct page *page, void *fsdata)
 {
        struct inode *inode = mapping->host;
        int ret = 0, ret2;
@@ -3001,10 +3050,7 @@ static int ext4_da_write_end(struct file *file,
                }
        }
-        trace_mark(ext4_da_write_end,
+        trace_ext4_da_write_end(inode, pos, len, copied);
-                   "dev %s ino %lu pos %llu len %u copied %u",
-                   inode->i_sb->s_id, inode->i_ino,
-                   (unsigned long long) pos, len, copied);
        start = pos & (PAGE_CACHE_SIZE - 1);
        end = start + copied - 1;
@@ -3081,7 +3127,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
         * not strictly speaking necessary (and for users of
         * laptop_mode, not even desirable).  However, to do otherwise
         * would require replicating code paths in:
-         * 
+         *
         * ext4_da_writepages() ->
         *    write_cache_pages() ---> (via passed in callback function)
         *        __mpage_da_writepage() -->
@@ -3101,7 +3147,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
         * write out the pages, but rather only collect contiguous
         * logical block extents, call the multi-block allocator, and
         * then update the buffer heads with the block allocations.
-         * 
+         *
         * For now, though, we'll cheat by calling filemap_flush(),
         * which will map the blocks, and start the I/O, but not
         * actually wait for the I/O to complete.
@@ -3171,226 +3217,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
        return generic_block_bmap(mapping, block, ext4_get_block);
 }
-static int bget_one(handle_t *handle, struct buffer_head *bh)
-{
-        get_bh(bh);
-        return 0;
-}
-static int bput_one(handle_t *handle, struct buffer_head *bh)
-{
-        put_bh(bh);
-        return 0;
-}
-/*
- * Note that we don't need to start a transaction unless we're journaling data
- * because we should have holes filled from ext4_page_mkwrite(). We even don't
- * need to file the inode to the transaction's list in ordered mode because if
- * we are writing back data added by write(), the inode is already there and if
- * we are writing back data modified via mmap(), noone guarantees in which
- * transaction the data will hit the disk. In case we are journaling data, we
- * cannot start transaction directly because transaction start ranks above page
- * lock so we have to do some magic.
- *
- * In all journaling modes block_write_full_page() will start the I/O.
- *
- * Problem:
- *
- *      ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
- *              ext4_writepage()
- *
- * Similar for:
- *
- *      ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
- *
- * Same applies to ext4_get_block().  We will deadlock on various things like
- * lock_journal and i_data_sem
- *
- * Setting PF_MEMALLOC here doesn't work - too many internal memory
- * allocations fail.
- *
- * 16May01: If we're reentered then journal_current_handle() will be
- *          non-zero. We simply *return*.
- *
- * 1 July 2001: @@@ FIXME:
- *   In journalled data mode, a data buffer may be metadata against the
- *   current transaction.  But the same file is part of a shared mapping
- *   and someone does a writepage() on it.
- *
- *   We will move the buffer onto the async_data list, but *after* it has
- *   been dirtied. So there's a small window where we have dirty data on
- *   BJ_Metadata.
- *
- *   Note that this only applies to the last partial page in the file.  The
- *   bit which block_write_full_page() uses prepare/commit for.  (That's
- *   broken code anyway: it's wrong for msync()).
- *
- *   It's a rare case: affects the final partial page, for journalled data
- *   where the file is subject to bith write() and writepage() in the same
- *   transction.  To fix it we'll need a custom block_write_full_page().
- *   We'll probably need that anyway for journalling writepage() output.
- *
- * We don't honour synchronous mounts for writepage().  That would be
- * disastrous.  Any write() or metadata operation will sync the fs for
- * us.
- *
- */
-static int __ext4_normal_writepage(struct page *page,
-                                struct writeback_control *wbc)
-{
-        struct inode *inode = page->mapping->host;
-        if (test_opt(inode->i_sb, NOBH))
-                return nobh_writepage(page, noalloc_get_block_write, wbc);
-        else
-                return block_write_full_page(page, noalloc_get_block_write,
-                                             wbc);
-}
-static int ext4_normal_writepage(struct page *page,
-                                struct writeback_control *wbc)
-{
-        struct inode *inode = page->mapping->host;
-        loff_t size = i_size_read(inode);
-        loff_t len;
-        trace_mark(ext4_normal_writepage,
-                   "dev %s ino %lu page_index %lu",
-                   inode->i_sb->s_id, inode->i_ino, page->index);
-        J_ASSERT(PageLocked(page));
-        if (page->index == size >> PAGE_CACHE_SHIFT)
-                len = size & ~PAGE_CACHE_MASK;
-        else
-                len = PAGE_CACHE_SIZE;
-        if (page_has_buffers(page)) {
-                /* if page has buffers it should all be mapped
-                 * and allocated. If there are not buffers attached
-                 * to the page we know the page is dirty but it lost
-                 * buffers. That means that at some moment in time
-                 * after write_begin() / write_end() has been called
-                 * all buffers have been clean and thus they must have been
-                 * written at least once. So they are all mapped and we can
-                 * happily proceed with mapping them and writing the page.
-                 */
-                BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-                                        ext4_bh_unmapped_or_delay));
-        }
-        if (!ext4_journal_current_handle())
-                return __ext4_normal_writepage(page, wbc);
-        redirty_page_for_writepage(wbc, page);
-        unlock_page(page);
-        return 0;
-}
-static int __ext4_journalled_writepage(struct page *page,
-                                struct writeback_control *wbc)
-{
-        struct address_space *mapping = page->mapping;
-        struct inode *inode = mapping->host;
-        struct buffer_head *page_bufs;
-        handle_t *handle = NULL;
-        int ret = 0;
-        int err;
-        ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-                                  noalloc_get_block_write);
-        if (ret != 0)
-                goto out_unlock;
-        page_bufs = page_buffers(page);
-        walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
-                                                                bget_one);
-        /* As soon as we unlock the page, it can go away, but we have
-         * references to buffers so we are safe */
-        unlock_page(page);
-        handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                goto out;
-        }
-        ret = walk_page_buffers(handle, page_bufs, 0,
-                        PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-        err = walk_page_buffers(handle, page_bufs, 0,
-                                PAGE_CACHE_SIZE, NULL, write_end_fn);
-        if (ret == 0)
-                ret = err;
-        err = ext4_journal_stop(handle);
-        if (!ret)
-                ret = err;
-        walk_page_buffers(handle, page_bufs, 0,
-                                PAGE_CACHE_SIZE, NULL, bput_one);
-        EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
-        goto out;
-out_unlock:
-        unlock_page(page);
-out:
-        return ret;
-}
-static int ext4_journalled_writepage(struct page *page,
-                                struct writeback_control *wbc)
-{
-        struct inode *inode = page->mapping->host;
-        loff_t size = i_size_read(inode);
-        loff_t len;
-        trace_mark(ext4_journalled_writepage,
-                   "dev %s ino %lu page_index %lu",
-                   inode->i_sb->s_id, inode->i_ino, page->index);
-        J_ASSERT(PageLocked(page));
-        if (page->index == size >> PAGE_CACHE_SHIFT)
-                len = size & ~PAGE_CACHE_MASK;
-        else
-                len = PAGE_CACHE_SIZE;
-        if (page_has_buffers(page)) {
-                /* if page has buffers it should all be mapped
-                 * and allocated. If there are not buffers attached
-                 * to the page we know the page is dirty but it lost
-                 * buffers. That means that at some moment in time
-                 * after write_begin() / write_end() has been called
-                 * all buffers have been clean and thus they must have been
-                 * written at least once. So they are all mapped and we can
-                 * happily proceed with mapping them and writing the page.
-                 */
-                BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-                                        ext4_bh_unmapped_or_delay));
-        }
-        if (ext4_journal_current_handle())
-                goto no_write;
-        if (PageChecked(page)) {
-                /*
-                 * It's mmapped pagecache.  Add buffers and journal it.  There
-                 * doesn't seem much point in redirtying the page here.
-                 */
-                ClearPageChecked(page);
-                return __ext4_journalled_writepage(page, wbc);
-        } else {
-                /*
-                 * It may be a page full of checkpoint-mode buffers.  We don't
-                 * really know unless we go poke around in the buffer_heads.
-                 * But block_write_full_page will do the right thing.
-                 */
-                return block_write_full_page(page, noalloc_get_block_write,
-                                             wbc);
-        }
-no_write:
-        redirty_page_for_writepage(wbc, page);
-        unlock_page(page);
-        return 0;
-}
 static int ext4_readpage(struct file *file, struct page *page)
 {
        return mpage_readpage(page, ext4_get_block);
@@ -3442,8 +3268,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 * VFS code falls back into buffered path in that case so we are safe.
 */
 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
-                        const struct iovec *iov, loff_t offset,
+                              const struct iovec *iov, loff_t offset,
-                        unsigned long nr_segs)
+                              unsigned long nr_segs)
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
@@ -3537,7 +3363,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
 static const struct address_space_operations ext4_ordered_aops = {
        .readpage               = ext4_readpage,
        .readpages              = ext4_readpages,
-        .writepage              = ext4_normal_writepage,
+        .writepage              = ext4_writepage,
        .sync_page              = block_sync_page,
        .write_begin            = ext4_write_begin,
        .write_end              = ext4_ordered_write_end,
@@ -3552,7 +3378,7 @@ static const struct address_space_operations ext4_ordered_aops = {
 static const struct address_space_operations ext4_writeback_aops = {
        .readpage               = ext4_readpage,
        .readpages              = ext4_readpages,
-        .writepage              = ext4_normal_writepage,
+        .writepage              = ext4_writepage,
        .sync_page              = block_sync_page,
        .write_begin            = ext4_write_begin,
        .write_end              = ext4_writeback_write_end,
@@ -3567,7 +3393,7 @@ static const struct address_space_operations ext4_writeback_aops = {
 static const struct address_space_operations ext4_journalled_aops = {
        .readpage               = ext4_readpage,
        .readpages              = ext4_readpages,
-        .writepage              = ext4_journalled_writepage,
+        .writepage              = ext4_writepage,
        .sync_page              = block_sync_page,
        .write_begin            = ext4_write_begin,
        .write_end              = ext4_journalled_write_end,
@@ -3581,7 +3407,7 @@ static const struct address_space_operations ext4_journalled_aops = {
 static const struct address_space_operations ext4_da_aops = {
        .readpage               = ext4_readpage,
        .readpages              = ext4_readpages,
-        .writepage              = ext4_da_writepage,
+        .writepage              = ext4_writepage,
        .writepages             = ext4_da_writepages,
        .sync_page              = block_sync_page,
        .write_begin            = ext4_da_write_begin,
@@ -3628,7 +3454,8 @@ int ext4_block_truncate_page(handle_t *handle,
        struct page *page;
        int err = 0;
-        page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
+        page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+                                   mapping_gfp_mask(mapping) & ~__GFP_FS);
        if (!page)
                return -EINVAL;
@@ -3763,7 +3590,8 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
 *                      (no partially truncated stuff there).  */
 static Indirect *ext4_find_shared(struct inode *inode, int depth,
-                        ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)
+                                  ext4_lblk_t offsets[4], Indirect chain[4],
+                                  __le32 *top)
 {
        Indirect *partial, *p;
        int k, err;
@@ -3819,8 +3647,10 @@ no_top:
 * than `count' because there can be holes in there.
 */
 static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
-                struct buffer_head *bh, ext4_fsblk_t block_to_free,
+                              struct buffer_head *bh,
-                unsigned long count, __le32 *first, __le32 *last)
+                              ext4_fsblk_t block_to_free,
+                              unsigned long count, __le32 *first,
+                              __le32 *last)
 {
        __le32 *p;
        if (try_to_extend_transaction(handle, inode)) {
@@ -3837,10 +3667,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
        }
        /*
-         * Any buffers which are on the journal will be in memory. We find
+         * Any buffers which are on the journal will be in memory. We
-         * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()
+         * find them on the hash table so jbd2_journal_revoke() will
-         * on them.  We've already detached each block from the file, so
+         * run jbd2_journal_forget() on them.  We've already detached
-         * bforget() in jbd2_journal_forget() should be safe.
+         * each block from the file, so bforget() in
+         * jbd2_journal_forget() should be safe.
         *
         * AKPM: turn on bforget in jbd2_journal_forget()!!!
         */
@@ -4212,7 +4043,7 @@ void ext4_truncate(struct inode *inode)
                                   (__le32*)partial->bh->b_data+addr_per_block,
                                   (chain+n-1) - partial);
                BUFFER_TRACE(partial->bh, "call brelse");
-                brelse (partial->bh);
+                brelse(partial->bh);
                partial--;
        }
 do_indirects:
@@ -4453,8 +4284,9 @@ void ext4_get_inode_flags(struct ext4_inode_info *ei)
        if (flags & S_DIRSYNC)
                ei->i_flags |= EXT4_DIRSYNC_FL;
 }
 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
-                                        struct ext4_inode_info *ei)
+                                  struct ext4_inode_info *ei)
 {
        blkcnt_t i_blocks ;
        struct inode *inode = &(ei->vfs_inode);
@@ -4493,10 +4325,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                return inode;
        ei = EXT4_I(inode);
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
-        ei->i_acl = EXT4_ACL_NOT_CACHED;
-        ei->i_default_acl = EXT4_ACL_NOT_CACHED;
-#endif
        ret = __ext4_get_inode_loc(inode, &iloc, 0);
        if (ret < 0)
@@ -4569,7 +4397,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                                        EXT4_GOOD_OLD_INODE_SIZE +
                                        ei->i_extra_isize;
                        if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
-                                 ei->i_state |= EXT4_STATE_XATTR;
+                                ei->i_state |= EXT4_STATE_XATTR;
                }
        } else
                ei->i_extra_isize = 0;
@@ -4588,7 +4416,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        ret = 0;
        if (ei->i_file_acl &&
-            ((ei->i_file_acl < 
+            ((ei->i_file_acl <
              (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
               EXT4_SB(sb)->s_gdb_count)) ||
             (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
@@ -4603,15 +4431,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                     !ext4_inode_is_fast_symlink(inode)))
                        /* Validate extent which is part of inode */
                        ret = ext4_ext_check_inode(inode);
-        } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+        } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
                   (S_ISLNK(inode->i_mode) &&
                    !ext4_inode_is_fast_symlink(inode))) {
-                /* Validate block references which are part of inode */
+                /* Validate block references which are part of inode */
                ret = ext4_check_inode_blockref(inode);
        }
        if (ret) {
-                brelse(bh);
+                brelse(bh);
-                goto bad_inode;
+                goto bad_inode;
        }
        if (S_ISREG(inode->i_mode)) {
@@ -4642,7 +4470,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        } else {
                brelse(bh);
                ret = -EIO;
-                ext4_error(inode->i_sb, __func__, 
+                ext4_error(inode->i_sb, __func__,
                           "bogus i_mode (%o) for inode=%lu",
                           inode->i_mode, inode->i_ino);
                goto bad_inode;
@@ -4795,8 +4623,9 @@ static int ext4_do_update_inode(handle_t *handle,
                                cpu_to_le32(new_encode_dev(inode->i_rdev));
                        raw_inode->i_block[2] = 0;
                }
-        } else for (block = 0; block < EXT4_N_BLOCKS; block++)
+        } else
-                raw_inode->i_block[block] = ei->i_data[block];
+                for (block = 0; block < EXT4_N_BLOCKS; block++)
+                        raw_inode->i_block[block] = ei->i_data[block];
        raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
        if (ei->i_extra_isize) {
@@ -5150,7 +4979,7 @@ int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
 * Give this, we know that the caller already has write access to iloc->bh.
 */
 int ext4_mark_iloc_dirty(handle_t *handle,
-                struct inode *inode, struct ext4_iloc *iloc)
+                         struct inode *inode, struct ext4_iloc *iloc)
 {
        int err = 0;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 91e75f7a9e73..7050a9cd04a4 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -12,8 +12,8 @@
 #include <linux/capability.h>
 #include <linux/time.h>
 #include <linux/compat.h>
-#include <linux/smp_lock.h>
 #include <linux/mount.h>
+#include <linux/file.h>
 #include <asm/uaccess.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
@@ -191,7 +191,7 @@ setversion_out:
        case EXT4_IOC_GROUP_EXTEND: {
                ext4_fsblk_t n_blocks_count;
                struct super_block *sb = inode->i_sb;
-                int err, err2;
+                int err, err2=0;
                if (!capable(CAP_SYS_RESOURCE))
                        return -EPERM;
@@ -204,19 +204,56 @@ setversion_out:
                        return err;
                err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
-                jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+                if (EXT4_SB(sb)->s_journal) {
-                err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+                        jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-                jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+                        err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+                        jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+                }
                if (err == 0)
                        err = err2;
                mnt_drop_write(filp->f_path.mnt);
                return err;
        }
+        /*
+         * EXT4_IOC_MOVE_EXT: read a struct move_extent from user space,
+         * run ext4_move_extents() between this file and the file referred
+         * to by me.donor_fd, and on success copy the structure (with the
+         * updated me.moved_len) back to the caller.
+         */
+        case EXT4_IOC_MOVE_EXT: {
+                struct move_extent me;
+                struct file *donor_filp;
+                int err;
+                if (copy_from_user(&me,
+                        (struct move_extent __user *)arg, sizeof(me)))
+                        return -EFAULT;
+                donor_filp = fget(me.donor_fd);
+                if (!donor_filp)
+                        return -EBADF;
+                /*
+                 * Without CAP_DAC_OVERRIDE the caller's fsuid must match
+                 * the inode's i_uid and both inodes must have S_IRUSR set.
+                 */
+                if (!capable(CAP_DAC_OVERRIDE)) {
+                        if ((current->real_cred->fsuid != inode->i_uid) ||
+                                !(inode->i_mode & S_IRUSR) ||
+                                !(donor_filp->f_dentry->d_inode->i_mode &
+                                S_IRUSR)) {
+                                /* drop the reference taken by fget() */
+                                fput(donor_filp);
+                                return -EACCES;
+                        }
+                }
+                err = ext4_move_extents(filp, donor_filp, me.orig_start,
+                                        me.donor_start, me.len, &me.moved_len);
+                fput(donor_filp);
+                /* arg is a user-space pointer: keep the __user annotation */
+                if (!err && copy_to_user((struct move_extent __user *)arg,
+                                         &me, sizeof(me)))
+                        return -EFAULT;
+                return err;
+        }
        case EXT4_IOC_GROUP_ADD: {
                struct ext4_new_group_data input;
                struct super_block *sb = inode->i_sb;
-                int err, err2;
+                int err, err2=0;
                if (!capable(CAP_SYS_RESOURCE))
                        return -EPERM;
@@ -230,9 +267,11 @@ setversion_out:
                        return err;
                err = ext4_group_add(sb, &input);
-                jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+                if (EXT4_SB(sb)->s_journal) {
-                err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+                        jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-                jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+                        err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+                        jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+                }
                if (err == 0)
                        err = err2;
                mnt_drop_write(filp->f_path.mnt);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index ed8482e22c0e..cd258463e2a9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,8 @@
 */
 #include "mballoc.h"
+#include <trace/events/ext4.h>
 /*
 * MUSTDO:
 *   - test ext4_ext_search_left() and ext4_ext_search_right()
@@ -340,8 +342,6 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
                                                ext4_group_t group);
 static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 {
 #if BITS_PER_LONG == 64
@@ -657,7 +657,8 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
        }
 }
-static void ext4_mb_generate_buddy(struct super_block *sb,
+static noinline_for_stack
+void ext4_mb_generate_buddy(struct super_block *sb,
                                void *buddy, void *bitmap, ext4_group_t group)
 {
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
@@ -1480,7 +1481,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
        ext4_mb_check_limits(ac, e4b, 0);
 }
-static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
+static noinline_for_stack
+int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
                                        struct ext4_buddy *e4b)
 {
        struct ext4_free_extent ex = ac->ac_b_ex;
@@ -1507,7 +1509,8 @@ static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
        return 0;
 }
-static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
+static noinline_for_stack
+int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
                                struct ext4_buddy *e4b)
 {
        ext4_group_t group = ac->ac_g_ex.fe_group;
@@ -1566,7 +1569,8 @@ static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
 * The routine scans buddy structures (not bitmap!) from given order
 * to max order and tries to find big enough chunk to satisfy the req
 */
-static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
+static noinline_for_stack
+void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
                                        struct ext4_buddy *e4b)
 {
        struct super_block *sb = ac->ac_sb;
@@ -1609,7 +1613,8 @@ static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
 * In order to optimize scanning, caller must pass number of
 * free blocks in the group, so the routine can know upper limit.
 */
-static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
+static noinline_for_stack
+void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
                                        struct ext4_buddy *e4b)
 {
        struct super_block *sb = ac->ac_sb;
@@ -1668,7 +1673,8 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 * we try to find stripe-aligned chunks for stripe-size requests
 * XXX should do so at least for multiples of stripe size as well
 */
-static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
+static noinline_for_stack
+void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
                                 struct ext4_buddy *e4b)
 {
        struct super_block *sb = ac->ac_sb;
@@ -1831,7 +1837,8 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
 }
-static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+static noinline_for_stack
+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 {
        int ret;
@@ -2859,9 +2866,8 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
                        + entry->start_blk
                        + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-                trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u",
+                trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
-                           sb->s_id, (unsigned long long) discard_block,
+                                          entry->count);
-                           entry->count);
                sb_issue_discard(sb, discard_block, entry->count);
                kmem_cache_free(ext4_free_ext_cachep, entry);
@@ -2903,7 +2909,11 @@ int __init init_ext4_mballoc(void)
 void exit_ext4_mballoc(void)
 {
-        /* XXX: synchronize_rcu(); */
+        /*
+         * Wait for completion of call_rcu()'s on ext4_pspace_cachep
+         * before destroying the slab cache.
+         */
+        rcu_barrier();
        kmem_cache_destroy(ext4_pspace_cachep);
        kmem_cache_destroy(ext4_ac_cachep);
        kmem_cache_destroy(ext4_free_ext_cachep);
@@ -3458,7 +3468,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 * used in in-core bitmap. buddy must be generated from this bitmap
 * Need to be called with ext4 group lock held
 */
-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+static noinline_for_stack
+void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group)
 {
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
@@ -3629,10 +3640,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
        mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
-        trace_mark(ext4_mb_new_inode_pa,
+        trace_ext4_mb_new_inode_pa(ac, pa);
-                   "dev %s ino %lu pstart %llu len %u lstart %u",
-                   sb->s_id, ac->ac_inode->i_ino,
-                   pa->pa_pstart, pa->pa_len, pa->pa_lstart);
        ext4_mb_use_inode_pa(ac, pa);
        atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3691,9 +3699,8 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
        pa->pa_type = MB_GROUP_PA;
        mb_debug("new group pa %p: %llu/%u for %u\n", pa,
-                 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
-        trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u",
+        trace_ext4_mb_new_group_pa(ac, pa);
-                   sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart);
        ext4_mb_use_group_pa(ac, pa);
        atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3783,10 +3790,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                        ext4_mb_store_history(ac);
                }
-                trace_mark(ext4_mb_release_inode_pa,
+                trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
-                           "dev %s ino %lu block %llu count %u",
+                                               next - bit);
-                           sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit,
-                           next - bit);
                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
                bit = next + 1;
        }
@@ -3820,8 +3825,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
        if (ac)
                ac->ac_op = EXT4_MB_HISTORY_DISCARD;
-        trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d",
+        trace_ext4_mb_release_group_pa(ac, pa);
-                   sb->s_id, pa->pa_pstart, pa->pa_len);
        BUG_ON(pa->pa_deleted == 0);
        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3889,6 +3893,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
        INIT_LIST_HEAD(&list);
        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+        if (ac)
+                ac->ac_sb = sb;
 repeat:
        ext4_lock_group(sb, group);
        list_for_each_entry_safe(pa, tmp,
@@ -3987,12 +3993,15 @@ void ext4_discard_preallocations(struct inode *inode)
        }
        mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
-        trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id,
+        trace_ext4_discard_preallocations(inode);
-                   inode->i_ino);
        INIT_LIST_HEAD(&list);
        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+        if (ac) {
+                ac->ac_sb = sb;
+                ac->ac_inode = inode;
+        }
 repeat:
        /* first, collect all pa's in the inode */
        spin_lock(&ei->i_prealloc_lock);
@@ -4218,14 +4227,9 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
        ext4_get_group_no_and_offset(sb, goal, &group, &block);
        /* set up allocation goals */
+        memset(ac, 0, sizeof(struct ext4_allocation_context));
        ac->ac_b_ex.fe_logical = ar->logical;
-        ac->ac_b_ex.fe_group = 0;
-        ac->ac_b_ex.fe_start = 0;
-        ac->ac_b_ex.fe_len = 0;
        ac->ac_status = AC_STATUS_CONTINUE;
-        ac->ac_groups_scanned = 0;
-        ac->ac_ex_scanned = 0;
-        ac->ac_found = 0;
        ac->ac_sb = sb;
        ac->ac_inode = ar->inode;
        ac->ac_o_ex.fe_logical = ar->logical;
@@ -4236,15 +4240,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
        ac->ac_g_ex.fe_group = group;
        ac->ac_g_ex.fe_start = block;
        ac->ac_g_ex.fe_len = len;
-        ac->ac_f_ex.fe_len = 0;
        ac->ac_flags = ar->flags;
-        ac->ac_2order = 0;
-        ac->ac_criteria = 0;
-        ac->ac_pa = NULL;
-        ac->ac_bitmap_page = NULL;
-        ac->ac_buddy_page = NULL;
-        ac->alloc_semp = NULL;
-        ac->ac_lg = NULL;
        /* we have to define context: we'll we work with a file or
         * locality group. this is a policy, actually */
@@ -4276,6 +4272,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
        INIT_LIST_HEAD(&discard_list);
        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+        if (ac)
+                ac->ac_sb = sb;
        spin_lock(&lg->lg_prealloc_lock);
        list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4445,8 +4443,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
        int ret;
        int freed = 0;
-        trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
+        trace_ext4_mb_discard_preallocations(sb, needed);
-                   sb->s_id, needed);
        for (i = 0; i < ngroups && needed > 0; i++) {
                ret = ext4_mb_discard_group_preallocations(sb, i, needed);
                freed += ret;
@@ -4475,17 +4472,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
        sb = ar->inode->i_sb;
        sbi = EXT4_SB(sb);
-        trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu "
+        trace_ext4_request_blocks(ar);
-                   "lblk %llu goal %llu lleft %llu lright %llu "
-                   "pleft %llu pright %llu ",
-                   sb->s_id, ar->flags, ar->len,
-                   ar->inode ? ar->inode->i_ino : 0,
-                   (unsigned long long) ar->logical,
-                   (unsigned long long) ar->goal,
-                   (unsigned long long) ar->lleft,
-                   (unsigned long long) ar->lright,
-                   (unsigned long long) ar->pleft,
-                   (unsigned long long) ar->pright);
        /*
         * For delayed allocation, we could skip the ENOSPC and
@@ -4594,18 +4581,7 @@ out3:
                                                reserv_blks);
        }
-        trace_mark(ext4_allocate_blocks,
+        trace_ext4_allocate_blocks(ar, (unsigned long long)block);
-                   "dev %s block %llu flags %u len %u ino %lu "
-                   "logical %llu goal %llu lleft %llu lright %llu "
-                   "pleft %llu pright %llu ",
-                   sb->s_id, (unsigned long long) block,
-                   ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
-                   (unsigned long long) ar->logical,
-                   (unsigned long long) ar->goal,
-                   (unsigned long long) ar->lleft,
-                   (unsigned long long) ar->lright,
-                   (unsigned long long) ar->pleft,
-                   (unsigned long long) ar->pright);
        return block;
 }
@@ -4709,7 +4685,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 * Main entry point into mballoc to free blocks
 */
 void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
-                        unsigned long block, unsigned long count,
+                        ext4_fsblk_t block, unsigned long count,
                        int metadata, unsigned long *freed)
 {
        struct buffer_head *bitmap_bh = NULL;
@@ -4735,15 +4711,12 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
            block + count > ext4_blocks_count(es)) {
                ext4_error(sb, __func__,
                            "Freeing blocks not in datazone - "
-                            "block = %lu, count = %lu", block, count);
+                            "block = %llu, count = %lu", block, count);
                goto error_return;
        }
-        ext4_debug("freeing block %lu\n", block);
+        ext4_debug("freeing block %llu\n", block);
-        trace_mark(ext4_free_blocks,
+        trace_ext4_free_blocks(inode, block, count, metadata);
-                   "dev %s block %llu count %lu metadata %d ino %lu",
-                   sb->s_id, (unsigned long long) block, count, metadata,
-                   inode ? inode->i_ino : 0);
        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
        if (ac) {
@@ -4784,7 +4757,7 @@ do_more:
                ext4_error(sb, __func__,
                           "Freeing blocks in system zone - "
-                           "Block = %lu, count = %lu", block, count);
+                           "Block = %llu, count = %lu", block, count);
                /* err = 0. ext4_std_error should be a no op */
                goto error_return;
        }
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 75e34f69215b..c96bb19f58f9 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -19,7 +19,6 @@
 #include <linux/seq_file.h>
 #include <linux/version.h>
 #include <linux/blkdev.h>
-#include <linux/marker.h>
 #include <linux/mutex.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index fe64d9f79852..313a50b39741 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -458,6 +458,7 @@ int ext4_ext_migrate(struct inode *inode)
        struct inode *tmp_inode = NULL;
        struct list_blocks_struct lb;
        unsigned long max_entries;
+        __u32 goal;
        /*
         * If the filesystem does not support extents, or the inode
@@ -483,9 +484,10 @@ int ext4_ext_migrate(struct inode *inode)
                retval = PTR_ERR(handle);
                return retval;
        }
-        tmp_inode = ext4_new_inode(handle,
+        goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
-                                inode->i_sb->s_root->d_inode,
+                EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
-                                S_IFREG);
+        tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
+                                   S_IFREG, 0, goal);
        if (IS_ERR(tmp_inode)) {
                retval = -ENOMEM;
                ext4_journal_stop(handle);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
new file mode 100644
index 000000000000..bbf2dd9404dc
--- /dev/null
+++ b/fs/ext4/move_extent.c
@@ -0,0 +1,1320 @@
+/*
+ * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
+ * Written by Takashi Sato <t-sato@yk.jp.nec.com>
+ *            Akira Fujita <a-fujita@rs.jp.nec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "ext4.h"
+/*
+ * get_ext_path - look up the extent path covering @block in @inode
+ *
+ * Passes the current @path to ext4_ext_find_extent() and stores the result
+ * back into @path. If the lookup returns an error pointer, the error code is
+ * stored in @ret and @path is set to NULL; @ret is not written on success.
+ * Arguments are parenthesized so that expression arguments expand safely.
+ */
+#define get_ext_path(path, inode, block, ret)                           \
+        do {                                                            \
+                (path) = ext4_ext_find_extent((inode), (block), (path)); \
+                if (IS_ERR(path)) {                                     \
+                        (ret) = PTR_ERR(path);                          \
+                        (path) = NULL;                                  \
+                }                                                       \
+        } while (0)
+/**
+ * copy_extent_status - Propagate the initialization state between extents
+ *
+ * @src:        extent whose initialized/uninitialized state is read
+ * @dest:       extent whose state is updated to match @src
+ *
+ * If @src is uninitialized, @dest is marked uninitialized as well.
+ * Otherwise @dest->ee_len is rewritten with @dest's own actual length.
+ */
+static void
+copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
+{
+        if (!ext4_ext_is_uninitialized(src)) {
+                /* Initialized source: store dest's actual length back. */
+                dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
+                return;
+        }
+        ext4_ext_mark_uninitialized(dest);
+}
+/**
+ * mext_next_extent - Search for the next extent and set it to "extent"
+ *
+ * @inode:      inode which is searched
+ * @path:       this will obtain data for the next extent
+ * @extent:     pointer to the next extent we have just gotten
+ *
+ * Search the next extent in the array of ext4_ext_path structure (@path)
+ * and set it to ext4_extent structure (@extent). In addition, the member of
+ * @path (->p_ext) also points the next extent. Return 0 on success, 1 if
+ * ext4_ext_path structure refers to the last extent, or a negative error
+ * value on failure.
+ *
+ * If the current leaf still has an extent after ->p_ext, only ->p_ext is
+ * advanced. Otherwise the function walks from the level above the leaf
+ * towards path[0] until it finds a level with a following index entry,
+ * steps to that entry, and then descends again taking the first index at
+ * each intermediate level and the first extent of the new leaf. Each block
+ * on the way down is read with sb_bread(); the buffer previously stored at
+ * that level is released first. -EIO is returned when sb_bread() fails.
+ */
+static int
+mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
+                      struct ext4_extent **extent)
+{
+        int ppos, leaf_ppos = path->p_depth;
+        ppos = leaf_ppos;
+        if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
+                /* leaf block: another extent follows in the same leaf */
+                *extent = ++path[ppos].p_ext;
+                return 0;
+        }
+        while (--ppos >= 0) { /* climb one level per iteration */
+                if (EXT_LAST_INDEX(path[ppos].p_hdr) >
+                    path[ppos].p_idx) {
+                        int cur_ppos = ppos;
+                        /* index block: step to the following index entry */
+                        path[ppos].p_idx++;
+                        path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+                        if (path[ppos+1].p_bh)
+                                brelse(path[ppos+1].p_bh); /* drop the old child buffer */
+                        path[ppos+1].p_bh =
+                                sb_bread(inode->i_sb, path[ppos].p_block);
+                        if (!path[ppos+1].p_bh)
+                                return -EIO;
+                        path[ppos+1].p_hdr =
+                                ext_block_hdr(path[ppos+1].p_bh);
+                        /* Halfway index block: take the first index at each level */
+                        while (++cur_ppos < leaf_ppos) {
+                                path[cur_ppos].p_idx =
+                                        EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
+                                path[cur_ppos].p_block =
+                                        idx_pblock(path[cur_ppos].p_idx);
+                                if (path[cur_ppos+1].p_bh)
+                                        brelse(path[cur_ppos+1].p_bh);
+                                path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
+                                        path[cur_ppos].p_block);
+                                if (!path[cur_ppos+1].p_bh)
+                                        return -EIO;
+                                path[cur_ppos+1].p_hdr =
+                                        ext_block_hdr(path[cur_ppos+1].p_bh);
+                        }
+                        /* leaf block: the next extent is the first one of the new leaf */
+                        path[leaf_ppos].p_ext = *extent =
+                                EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+                        return 0;
+                }
+        }
+        /* We found the last extent: no level had a following entry */
+        return 1;
+}
+/**
+ * mext_double_down_read - Take the read side of both inodes' i_data_sem
+ *
+ * @orig_inode:         original inode structure
+ * @donor_inode:        donor inode structure
+ *
+ * The semaphore of the inode with the smaller i_ino is taken first; when
+ * both numbers are equal, @orig_inode is taken first. Both pointers must
+ * be non-NULL.
+ */
+static void
+mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
+{
+        struct inode *first, *second;
+        BUG_ON(!orig_inode || !donor_inode);
+        /*
+         * Order by inode number rather than by address: C gives no
+         * guarantee for comparing pointers into different objects.
+         */
+        if (orig_inode->i_ino <= donor_inode->i_ino) {
+                first = orig_inode;
+                second = donor_inode;
+        } else {
+                first = donor_inode;
+                second = orig_inode;
+        }
+        down_read(&EXT4_I(first)->i_data_sem);
+        down_read(&EXT4_I(second)->i_data_sem);
+}
+/**
+ * mext_double_down_write - Take the write side of both inodes' i_data_sem
+ *
+ * @orig_inode:         original inode structure
+ * @donor_inode:        donor inode structure
+ *
+ * The semaphore of the inode with the smaller i_ino is taken first; when
+ * both numbers are equal, @orig_inode is taken first. Both pointers must
+ * be non-NULL.
+ */
+static void
+mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
+{
+        struct inode *first, *second;
+        BUG_ON(!orig_inode || !donor_inode);
+        /*
+         * Order by inode number rather than by address: C gives no
+         * guarantee for comparing pointers into different objects.
+         */
+        if (orig_inode->i_ino <= donor_inode->i_ino) {
+                first = orig_inode;
+                second = donor_inode;
+        } else {
+                first = donor_inode;
+                second = orig_inode;
+        }
+        down_write(&EXT4_I(first)->i_data_sem);
+        down_write(&EXT4_I(second)->i_data_sem);
+}
+/**
+ * mext_double_up_read - Drop the read side of both inodes' i_data_sem
+ *
+ * @orig_inode:         original inode; its semaphore is released first
+ * @donor_inode:        donor inode; its semaphore is released second
+ *
+ * Both pointers must be non-NULL.
+ */
+static void
+mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
+{
+        BUG_ON(!orig_inode || !donor_inode);
+        /* Release order is fixed: orig, then donor. */
+        up_read(&EXT4_I(orig_inode)->i_data_sem);
+        up_read(&EXT4_I(donor_inode)->i_data_sem);
+}
+/**
+ * mext_double_up_write - Drop the write side of both inodes' i_data_sem
+ *
+ * @orig_inode:         original inode; its semaphore is released first
+ * @donor_inode:        donor inode; its semaphore is released second
+ *
+ * Both pointers must be non-NULL.
+ */
+static void
+mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
+{
+        BUG_ON(!orig_inode || !donor_inode);
+        /* Release order is fixed: orig, then donor. */
+        up_write(&EXT4_I(orig_inode)->i_data_sem);
+        up_write(&EXT4_I(donor_inode)->i_data_sem);
+}
+/**
+ * mext_insert_across_blocks - Insert extents across leaf block
+ *
+ * @handle:             journal handle
+ * @orig_inode:         original inode
+ * @o_start:            first original extent to be changed
+ * @o_end:              last original extent to be changed
+ * @start_ext:          first new extent to be inserted
+ * @new_ext:            middle of new extent to be inserted
+ * @end_ext:            last new extent to be inserted
+ *
+ * Adjust the original extents in place according to which of @start_ext,
+ * @new_ext and @end_ext are non-empty, then insert @new_ext (and, when the
+ * original range was a single extent split in three, @end_ext as well) with
+ * ext4_ext_insert_extent(). Return 0 on success, or a negative error value
+ * on failure; this includes a failed path lookup, a failed insertion, and
+ * -EIO for a combination of extents this function does not handle.
+ */
+static int
+mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
+                struct ext4_extent *o_start, struct ext4_extent *o_end,
+                struct ext4_extent *start_ext, struct ext4_extent *new_ext,
+                struct ext4_extent *end_ext)
+{
+        struct ext4_ext_path *orig_path = NULL;
+        ext4_lblk_t eblock = 0;
+        int new_flag = 0;
+        int end_flag = 0;
+        int err = 0;
+        if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) {
+                if (o_start == o_end) {
+                        /*       start_ext   new_ext    end_ext
+                         * donor |---------|-----------|--------|
+                         * orig  |------------------------------|
+                         */
+                        end_flag = 1;
+                } else {
+                        /*       start_ext   new_ext   end_ext
+                         * donor |---------|----------|---------|
+                         * orig  |---------------|--------------|
+                         */
+                        o_end->ee_block = end_ext->ee_block;
+                        o_end->ee_len = end_ext->ee_len;
+                        ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+                }
+                o_start->ee_len = start_ext->ee_len;
+                new_flag = 1;
+        } else if (start_ext->ee_len && new_ext->ee_len &&
+                   !end_ext->ee_len && o_start == o_end) {
+                /*       start_ext      new_ext
+                 * donor |--------------|---------------|
+                 * orig  |------------------------------|
+                 */
+                o_start->ee_len = start_ext->ee_len;
+                new_flag = 1;
+        } else if (!start_ext->ee_len && new_ext->ee_len &&
+                   end_ext->ee_len && o_start == o_end) {
+                /*        new_ext       end_ext
+                 * donor |--------------|---------------|
+                 * orig  |------------------------------|
+                 */
+                o_end->ee_block = end_ext->ee_block;
+                o_end->ee_len = end_ext->ee_len;
+                ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+                /*
+                 * Set 0 to the extent block if new_ext was
+                 * the first block.
+                 */
+                if (new_ext->ee_block)
+                        eblock = le32_to_cpu(new_ext->ee_block);
+                new_flag = 1;
+        } else {
+                ext4_debug("ext4 move extent: Unexpected insert case\n");
+                return -EIO;
+        }
+        if (new_flag) {
+                get_ext_path(orig_path, orig_inode, eblock, err);
+                if (orig_path == NULL)
+                        goto out;
+                /* Keep the error code so a failed insert is not reported as 0 */
+                err = ext4_ext_insert_extent(handle, orig_inode,
+                                        orig_path, new_ext);
+                if (err)
+                        goto out;
+        }
+        if (end_flag) {
+                /* Look up the block just before end_ext to find its slot */
+                get_ext_path(orig_path, orig_inode,
+                                      le32_to_cpu(end_ext->ee_block) - 1, err);
+                if (orig_path == NULL)
+                        goto out;
+                err = ext4_ext_insert_extent(handle, orig_inode,
+                                           orig_path, end_ext);
+                if (err)
+                        goto out;
+        }
+out:
+        if (orig_path) {
+                ext4_ext_drop_refs(orig_path);
+                kfree(orig_path);
+        }
+        return err;
+}
+/**
+ * mext_insert_inside_block - Rewrite extents in place within one leaf
+ *
+ * @o_start:            first original extent to be moved
+ * @o_end:              last original extent to be moved
+ * @start_ext:          first new extent to be inserted
+ * @new_ext:            middle of new extent to be inserted
+ * @end_ext:            last new extent to be inserted
+ * @eh:                 extent header of target leaf block
+ * @range_to_move:      number of slots by which the entries after @o_end
+ *                      are shifted; also added to the entry count of @eh
+ *
+ * The entries following @o_end are shifted by @range_to_move slots, then
+ * the slots starting at @o_start are overwritten with whichever of
+ * @start_ext, @new_ext and @end_ext have a non-zero length, in that order.
+ * For @start_ext only the length field is stored.
+ */
+static void
+mext_insert_inside_block(struct ext4_extent *o_start,
+                              struct ext4_extent *o_end,
+                              struct ext4_extent *start_ext,
+                              struct ext4_extent *new_ext,
+                              struct ext4_extent *end_ext,
+                              struct ext4_extent_header *eh,
+                              int range_to_move)
+{
+        struct ext4_extent *slot = o_start;
+        struct ext4_extent *tail;
+        unsigned long tail_bytes;
+        /* Shift everything behind o_end to open or close the gap */
+        if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
+                tail = o_end + 1;
+                tail_bytes = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) -
+                        (unsigned long)tail;
+                memmove(tail + range_to_move, tail, tail_bytes);
+        }
+        /* Leading piece: only its length changes */
+        if (start_ext->ee_len) {
+                slot->ee_len = start_ext->ee_len;
+                slot++;
+        }
+        /* Middle piece: full copy plus physical block */
+        if (new_ext->ee_len) {
+                *slot = *new_ext;
+                ext4_ext_store_pblock(slot, ext_pblock(new_ext));
+                slot++;
+        }
+        /* Trailing piece */
+        if (end_ext->ee_len)
+                *slot = *end_ext;
+        /* Account for the slots gained or lost in the leaf header */
+        le16_add_cpu(&eh->eh_entries, range_to_move);
+}
+/**
+ * mext_insert_extents - Insert new extent
+ *
+ * @handle:     journal handle
+ * @orig_inode: original inode
+ * @orig_path:  path indicates first extent to be changed
+ * @o_start:    first original extent to be changed
+ * @o_end:      last original extent to be changed
+ * @start_ext:  first new extent to be inserted
+ * @new_ext:    middle of new extent to be inserted
+ * @end_ext:    last new extent to be inserted
+ *
+ * Call the function to insert extents. If we cannot add more extents into
+ * the leaf block, we call mext_insert_across_blocks() to create a
+ * new leaf block. Otherwise call mext_insert_inside_block(). Return 0
+ * on success, or a negative error value on failure.
+ *
+ * When the tree depth is non-zero the leaf's buffer (->p_bh) is passed to
+ * ext4_journal_get_write_access() before the change and to
+ * ext4_handle_dirty_metadata() afterwards; at depth zero the inode is
+ * marked dirty with ext4_mark_inode_dirty() instead.
+ */
+static int
+mext_insert_extents(handle_t *handle, struct inode *orig_inode,
+                         struct ext4_ext_path *orig_path,
+                         struct ext4_extent *o_start,
+                         struct ext4_extent *o_end,
+                         struct ext4_extent *start_ext,
+                         struct ext4_extent *new_ext,
+                         struct ext4_extent *end_ext)
+{
+        struct  ext4_extent_header *eh;
+        unsigned long need_slots, slots_range;
+        int     range_to_move, depth, ret;
+        /*
+         * The extents need to be inserted
+         * start_extent + new_extent + end_extent.
+         * Only pieces with a non-zero length need a slot.
+         */
+        need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) +
+                (new_ext->ee_len ? 1 : 0);
+        /*
+         * The number of slots between start and end, both inclusive.
+         * The extra byte added below is discarded by the integer division.
+         */
+        slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1)
+                / sizeof(struct ext4_extent);
+        /* Range to move the end of extent; positive when the leaf must grow */
+        range_to_move = need_slots - slots_range;
+        depth = orig_path->p_depth;
+        orig_path += depth; /* from here on orig_path is the leaf-level entry */
+        eh = orig_path->p_hdr;
+        if (depth) {
+                /* Register to journal */
+                ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
+                if (ret)
+                        return ret;
+        }
+        /* Expansion: more new slots are needed than the leaf has free */
+        if (range_to_move > 0 &&
+                (range_to_move > le16_to_cpu(eh->eh_max)
+                        - le16_to_cpu(eh->eh_entries))) {
+                ret = mext_insert_across_blocks(handle, orig_inode, o_start,
+                                        o_end, start_ext, new_ext, end_ext);
+                if (ret < 0)
+                        return ret;
+        } else
+                mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
+                                                end_ext, eh, range_to_move);
+        if (depth) {
+                ret = ext4_handle_dirty_metadata(handle, orig_inode,
+                                                 orig_path->p_bh);
+                if (ret)
+                        return ret;
+        } else {
+                ret = ext4_mark_inode_dirty(handle, orig_inode);
+                if (ret < 0)
+                        return ret;
+        }
+        return 0;
+}
+/**
+ * mext_leaf_block - Move one leaf extent block into the inode.
+ *
+ * @handle:             journal handle
+ * @orig_inode:         original inode
+ * @orig_path:          path indicates first extent to be changed
+ * @dext:               donor extent
+ * @from:               start offset on the target file
+ *
+ * In order to insert extents into the leaf block, we must divide the extent
+ * in the leaf block into three extents. The one is located to be inserted
+ * extents, and the others are located around it.
+ *
+ * Therefore, this function creates structures to save extents of the leaf
+ * block, and inserts extents by calling mext_insert_extents() with
+ * created extents. Return 0 on success, or a negative error value on failure.
+ *
+ * The new extent starts at logical block *@from and takes its physical
+ * block and length from @dext. It must not reach past the end of the
+ * original extent found at the leaf level of @orig_path (enforced with
+ * BUG_ON).
+ */
+static int
+mext_leaf_block(handle_t *handle, struct inode *orig_inode,
+                     struct ext4_ext_path *orig_path, struct ext4_extent *dext,
+                     ext4_lblk_t *from)
+{
+        struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
+        struct ext4_extent new_ext, start_ext, end_ext;
+        ext4_lblk_t new_ext_end;
+        int oext_alen, new_ext_alen, end_ext_alen;
+        int depth = ext_depth(orig_inode);
+        o_start = o_end = oext = orig_path[depth].p_ext;
+        oext_alen = ext4_ext_get_actual_len(oext);
+        /* A zero length marks start_ext/end_ext as "not needed" */
+        start_ext.ee_len = end_ext.ee_len = 0;
+        new_ext.ee_block = cpu_to_le32(*from);
+        ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
+        new_ext.ee_len = dext->ee_len;
+        new_ext_alen = ext4_ext_get_actual_len(&new_ext);
+        /* Last logical block covered by new_ext */
+        new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
+        /*
+         * Case: original extent is first
+         * oext      |--------|
+         * new_ext      |--|
+         * start_ext |--|
+         */
+        if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) &&
+                le32_to_cpu(new_ext.ee_block) <
+                le32_to_cpu(oext->ee_block) + oext_alen) {
+                start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
+                                               le32_to_cpu(oext->ee_block));
+                copy_extent_status(oext, &start_ext);
+        } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
+                prev_ext = oext - 1;
+                /*
+                 * We can merge new_ext into previous extent,
+                 * if these are contiguous and same extent type.
+                 */
+                if (ext4_can_extents_be_merged(orig_inode, prev_ext,
+                                               &new_ext)) {
+                        o_start = prev_ext;
+                        start_ext.ee_len = cpu_to_le16(
+                                ext4_ext_get_actual_len(prev_ext) +
+                                new_ext_alen);
+                        copy_extent_status(prev_ext, &start_ext);
+                        new_ext.ee_len = 0;
+                }
+        }
+        /*
+         * Case: new_ext must not end beyond the last block of oext
+         * oext      |-----------|
+         * new_ext       |-------|
+         */
+        BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end);
+        /*
+         * Case: new_ext is smaller than original extent
+         * oext    |---------------|
+         * new_ext |-----------|
+         * end_ext             |---|
+         */
+        if (le32_to_cpu(oext->ee_block) <= new_ext_end &&
+                new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) {
+                end_ext.ee_len =
+                        cpu_to_le16(le32_to_cpu(oext->ee_block) +
+                        oext_alen - 1 - new_ext_end);
+                copy_extent_status(oext, &end_ext);
+                end_ext_alen = ext4_ext_get_actual_len(&end_ext);
+                /* end_ext covers the last end_ext_alen blocks of o_end */
+                ext4_ext_store_pblock(&end_ext,
+                        (ext_pblock(o_end) + oext_alen - end_ext_alen));
+                end_ext.ee_block =
+                        cpu_to_le32(le32_to_cpu(o_end->ee_block) +
+                        oext_alen - end_ext_alen);
+        }
+        return mext_insert_extents(handle, orig_inode, orig_path, o_start,
+                                o_end, &start_ext, &new_ext, &end_ext);
+}
+/**
+ * mext_calc_swap_extents - Calculate extents for extent swapping.
+ *
+ * @tmp_dext:           the extent that will belong to the original inode
+ * @tmp_oext:           the extent that will belong to the donor inode
+ * @orig_off:           block offset of original inode
+ * @donor_off:          block offset of donor inode
+ * @max_count:          the maximum length of extents
+ *
+ * Both extents are modified in place. @tmp_dext is advanced so that it
+ * starts at @donor_off and is limited to @max_count blocks and to the part
+ * of @tmp_oext that remains after @orig_off. The physical start of
+ * @tmp_oext is advanced to @orig_off and its length is set to the resulting
+ * length of @tmp_dext. Finally the initialization status is crossed over:
+ * @tmp_dext takes the status @tmp_oext had on entry and vice versa.
+ */
+static void
+mext_calc_swap_extents(struct ext4_extent *tmp_dext,
+                              struct ext4_extent *tmp_oext,
+                              ext4_lblk_t orig_off, ext4_lblk_t donor_off,
+                              ext4_lblk_t max_count)
+{
+        ext4_lblk_t diff, orig_diff;
+        struct ext4_extent dext_old, oext_old;
+        dext_old = *tmp_dext; /* copies taken before trimming, for the status swap */
+        oext_old = *tmp_oext;
+        /* When tmp_dext is too large, pick up the target range. */
+        diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
+        ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff);
+        tmp_dext->ee_block =
+                        cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
+        tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
+        if (max_count < ext4_ext_get_actual_len(tmp_dext))
+                tmp_dext->ee_len = cpu_to_le16(max_count);
+        /* Only the physical start of tmp_oext is moved; ee_block is left as is */
+        orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
+        ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff);
+        /* Adjust extent length if donor extent is larger than orig */
+        if (ext4_ext_get_actual_len(tmp_dext) >
+            ext4_ext_get_actual_len(tmp_oext) - orig_diff)
+                tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) -
+                                                orig_diff);
+        tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext));
+        copy_extent_status(&oext_old, tmp_dext); /* status follows the logical range, not the blocks */
+        copy_extent_status(&dext_old, tmp_oext);
+}
+/**
+ * mext_replace_branches - Replace original extents with new extents
+ *
+ * @handle:             journal handle
+ * @orig_inode:         original inode
+ * @donor_inode:        donor inode
+ * @from:               block offset of orig_inode
+ * @count:              block count to be replaced
+ *
+ * Replace original inode extents and donor inode extents page by page.
+ * We implement this replacement in the following three steps:
+ * 1. Save the block information of original and donor inodes into
+ *    dummy extents.
+ * 2. Change the block information of original inode to point at the
+ *    donor inode blocks.
+ * 3. Change the block information of donor inode to point at the saved
+ *    original inode blocks in the dummy extents.
+ *
+ * Return 0 on success, or a negative error value on failure.
+ */
+static int
+mext_replace_branches(handle_t *handle, struct inode *orig_inode,
+                           struct inode *donor_inode, ext4_lblk_t from,
+                           ext4_lblk_t count)
+{
+        struct ext4_ext_path *orig_path = NULL;
+        struct ext4_ext_path *donor_path = NULL;
+        struct ext4_extent *oext, *dext;
+        struct ext4_extent tmp_dext, tmp_oext;
+        ext4_lblk_t orig_off = from, donor_off = from;
+        int err = 0;
+        int depth;
+        int replaced_count = 0;
+        int dext_alen;
+        mext_double_down_write(orig_inode, donor_inode);
+        /* Get the original extent for the block "orig_off" */
+        get_ext_path(orig_path, orig_inode, orig_off, err);
+        if (orig_path == NULL)
+                goto out;
+        /* Get the donor extent for the block "donor_off" */
+        get_ext_path(donor_path, donor_inode, donor_off, err);
+        if (donor_path == NULL)
+                goto out;
+        depth = ext_depth(orig_inode);
+        oext = orig_path[depth].p_ext;
+        tmp_oext = *oext;
+        depth = ext_depth(donor_inode);
+        dext = donor_path[depth].p_ext;
+        tmp_dext = *dext;
+        mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+                                      donor_off, count);
+        /* Loop for the donor extents */
+        while (1) {
+                /* The extent for donor must be found. */
+                BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block));
+                /* Set donor extent to orig extent */
+                err = mext_leaf_block(handle, orig_inode,
+                                           orig_path, &tmp_dext, &orig_off);
+                if (err < 0)
+                        goto out;
+                /* Set orig extent to donor extent */
+                err = mext_leaf_block(handle, donor_inode,
+                                           donor_path, &tmp_oext, &donor_off);
+                if (err < 0)
+                        goto out;
+                dext_alen = ext4_ext_get_actual_len(&tmp_dext);
+                replaced_count += dext_alen;
+                donor_off += dext_alen;
+                orig_off += dext_alen;
+                /* Stop once the requested number of blocks is replaced */
+                if (replaced_count >= count)
+                        break;
+                if (orig_path)
+                        ext4_ext_drop_refs(orig_path);
+                get_ext_path(orig_path, orig_inode, orig_off, err);
+                if (orig_path == NULL)
+                        goto out;
+                depth = ext_depth(orig_inode);
+                oext = orig_path[depth].p_ext;
+                if (le32_to_cpu(oext->ee_block) +
+                                ext4_ext_get_actual_len(oext) <= orig_off) {
+                        err = 0;
+                        goto out;
+                }
+                tmp_oext = *oext;
+                if (donor_path)
+                        ext4_ext_drop_refs(donor_path);
+                get_ext_path(donor_path, donor_inode,
+                                      donor_off, err);
+                if (donor_path == NULL)
+                        goto out;
+                depth = ext_depth(donor_inode);
+                dext = donor_path[depth].p_ext;
+                if (le32_to_cpu(dext->ee_block) +
+                                ext4_ext_get_actual_len(dext) <= donor_off) {
+                        err = 0;
+                        goto out;
+                }
+                tmp_dext = *dext;
+                mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+                                              donor_off,
+                                              count - replaced_count);
+        }
+out:
+        if (orig_path) {
+                ext4_ext_drop_refs(orig_path);
+                kfree(orig_path);
+        }
+        if (donor_path) {
+                ext4_ext_drop_refs(donor_path);
+                kfree(donor_path);
+        }
+        mext_double_up_write(orig_inode, donor_inode);
+        return err;
+}
+/**
+ * move_extent_par_page - Move extent data per page
+ *
+ * @o_filp:                     file structure of original file
+ * @donor_inode:                donor inode
+ * @orig_page_offset:           page index on original file
+ * @data_offset_in_page:        block index where data swapping starts
+ * @block_len_in_page:          the number of blocks to be swapped
+ * @uninit:                     orig extent is uninitialized or not
+ *
+ * Save the data in original inode blocks and replace original inode extents
+ * with donor inode extents by calling mext_replace_branches().
+ * Finally, write out the saved data in new original inode blocks. Return 0
+ * on success, or a negative error value on failure.
+ */
+static int
+move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
+                  pgoff_t orig_page_offset, int data_offset_in_page,
+                  int block_len_in_page, int uninit)
+{
+        struct inode *orig_inode = o_filp->f_dentry->d_inode;
+        struct address_space *mapping = orig_inode->i_mapping;
+        struct buffer_head *bh;
+        struct page *page = NULL;
+        const struct address_space_operations *a_ops = mapping->a_ops;
+        handle_t *handle;
+        ext4_lblk_t orig_blk_offset;
+        long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
+        unsigned long blocksize = orig_inode->i_sb->s_blocksize;
+        unsigned int w_flags = 0;
+        unsigned int tmp_data_len, data_len;
+        void *fsdata;
+        int ret, i, jblocks;
+        int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+        /*
+         * It needs twice the amount of ordinary journal buffers because
+         * orig_inode and donor_inode may each change different metadata blocks.
+         */
+        jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
+        handle = ext4_journal_start(orig_inode, jblocks);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                return ret;
+        }
+        if (segment_eq(get_fs(), KERNEL_DS))
+                w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
+        orig_blk_offset = orig_page_offset * blocks_per_page +
+                data_offset_in_page;
+        /*
+         * If the orig extent is an uninitialized one,
+         * it's not necessary to force the page into memory
+         * and then force it to be written out again.
+         * Just swap data blocks between orig and donor.
+         */
+        if (uninit) {
+                ret = mext_replace_branches(handle, orig_inode,
+                                                 donor_inode, orig_blk_offset,
+                                                 block_len_in_page);
+                /* Clear the inode cache not to refer to the old data */
+                ext4_ext_invalidate_cache(orig_inode);
+                ext4_ext_invalidate_cache(donor_inode);
+                goto out2;
+        }
+        offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
+        /* Calculate data_len */
+        if ((orig_blk_offset + block_len_in_page - 1) ==
+            ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
+                /* Replace the last block */
+                tmp_data_len = orig_inode->i_size & (blocksize - 1);
+                /*
+                 * If tmp_data_len equals zero, i_size is a multiple of
+                 * blocksize, so the last block is full: use blocksize.
+                 */
+                if (tmp_data_len == 0)
+                        tmp_data_len = blocksize;
+                data_len = tmp_data_len +
+                        ((block_len_in_page - 1) << orig_inode->i_blkbits);
+        } else {
+                data_len = block_len_in_page << orig_inode->i_blkbits;
+        }
+        ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags,
+                                 &page, &fsdata);
+        if (unlikely(ret < 0))
+                goto out;
+        if (!PageUptodate(page)) {
+                mapping->a_ops->readpage(o_filp, page);
+                lock_page(page);
+        }
+        /*
+         * try_to_release_page() doesn't call releasepage in writeback mode.
+         * We should care about the order of writing to the same file
+         * by multiple move extent processes.
+         * It needs to call wait_on_page_writeback() to wait for the
+         * writeback of the page.
+         */
+        if (PageWriteback(page))
+                wait_on_page_writeback(page);
+        /* Release old bh and drop refs */
+        try_to_release_page(page, 0);
+        ret = mext_replace_branches(handle, orig_inode, donor_inode,
+                                         orig_blk_offset, block_len_in_page);
+        if (ret < 0)
+                goto out;
+        /* Clear the inode cache not to refer to the old data */
+        ext4_ext_invalidate_cache(orig_inode);
+        ext4_ext_invalidate_cache(donor_inode);
+        if (!page_has_buffers(page))
+                create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
+        bh = page_buffers(page);
+        for (i = 0; i < data_offset_in_page; i++)
+                bh = bh->b_this_page;
+        for (i = 0; i < block_len_in_page; i++) {
+                ret = ext4_get_block(orig_inode,
+                                (sector_t)(orig_blk_offset + i), bh, 0);
+                if (ret < 0)
+                        goto out;
+                if (bh->b_this_page != NULL)
+                        bh = bh->b_this_page;
+        }
+        ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len,
+                               page, fsdata);
+        page = NULL;
+out:
+        if (unlikely(page)) {
+                if (PageLocked(page))
+                        unlock_page(page);
+                page_cache_release(page);
+        }
+out2:
+        ext4_journal_stop(handle);
+        return ret < 0 ? ret : 0;
+}
+/**
+ * mext_check_arguments - Check whether move extent can be done
+ *
+ * @orig_inode:         original inode
+ * @donor_inode:        donor inode
+ * @orig_start:         logical start offset in block for orig
+ * @donor_start:        logical start offset in block for donor
+ * @len:                the number of blocks to be moved (may be reduced here)
+ * @moved_len:          moved block length; must be 0 on entry
+ *
+ * Check whether the arguments of ext4_move_extents() allow the two files
+ * to be exchanged. NOTE(review): block-unit start/len are compared with
+ * byte-unit i_size below - confirm. Return 0 on success, negative on error.
+ */
+static int
+mext_check_arguments(struct inode *orig_inode,
+                          struct inode *donor_inode, __u64 orig_start,
+                          __u64 donor_start, __u64 *len, __u64 moved_len)
+{
+        /* Regular file check */
+        if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+                ext4_debug("ext4 move extent: The argument files should be "
+                        "regular file [ino:orig %lu, donor %lu]\n",
+                        orig_inode->i_ino, donor_inode->i_ino);
+                return -EINVAL;
+        }
+        /* Ext4 move extent does not support swapfile */
+        if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
+                ext4_debug("ext4 move extent: The argument files should "
+                        "not be swapfile [ino:orig %lu, donor %lu]\n",
+                        orig_inode->i_ino, donor_inode->i_ino);
+                return -EINVAL;
+        }
+        /* Files should be in the same ext4 FS */
+        if (orig_inode->i_sb != donor_inode->i_sb) {
+                ext4_debug("ext4 move extent: The argument files "
+                        "should be in same FS [ino:orig %lu, donor %lu]\n",
+                        orig_inode->i_ino, donor_inode->i_ino);
+                return -EINVAL;
+        }
+        /* orig and donor should be different files */
+        if (orig_inode->i_ino == donor_inode->i_ino) {
+                ext4_debug("ext4 move extent: The argument files should not "
+                        "be same file [ino:orig %lu, donor %lu]\n",
+                        orig_inode->i_ino, donor_inode->i_ino);
+                return -EINVAL;
+        }
+        /* Ext4 move extent supports only extent-based files */
+        if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
+                ext4_debug("ext4 move extent: orig file is not extents "
+                        "based file [ino:orig %lu]\n", orig_inode->i_ino);
+                return -EOPNOTSUPP;
+        } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
+                ext4_debug("ext4 move extent: donor file is not extents "
+                        "based file [ino:donor %lu]\n", donor_inode->i_ino);
+                return -EOPNOTSUPP;
+        }
+        if ((!orig_inode->i_size) || (!donor_inode->i_size)) {
+                ext4_debug("ext4 move extent: File size is 0 byte\n");
+                return -EINVAL;
+        }
+        /* Start offsets should be the same */
+        if (orig_start != donor_start) {
+                ext4_debug("ext4 move extent: orig and donor's start "
+                        "offset are not same [ino:orig %lu, donor %lu]\n",
+                        orig_inode->i_ino, donor_inode->i_ino);
+                return -EINVAL;
+        }
+        if (moved_len) {
+                ext4_debug("ext4 move extent: moved_len should be 0 "
+                        "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
+                        donor_inode->i_ino);
+                return -EINVAL;
+        }
+        if ((orig_start > MAX_DEFRAG_SIZE) ||
+            (donor_start > MAX_DEFRAG_SIZE) ||
+            (*len > MAX_DEFRAG_SIZE) ||
+            (orig_start + *len > MAX_DEFRAG_SIZE))  {
+                ext4_debug("ext4 move extent: Can't handle over [%lu] blocks "
+                        "[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE,
+                        orig_inode->i_ino, donor_inode->i_ino);
+                return -EINVAL;
+        }
+        if (orig_inode->i_size > donor_inode->i_size) {
+                if (orig_start >= donor_inode->i_size) {
+                        ext4_debug("ext4 move extent: orig start offset "
+                        "[%llu] should be less than donor file size "
+                        "[%lld] [ino:orig %lu, donor_inode %lu]\n",
+                        orig_start, donor_inode->i_size,
+                        orig_inode->i_ino, donor_inode->i_ino);
+                        return -EINVAL;
+                }
+                if (orig_start + *len > donor_inode->i_size) {
+                        ext4_debug("ext4 move extent: End offset [%llu] should "
+                                "be less than donor file size [%lld]."
+                                "So adjust length from %llu to %lld "
+                                "[ino:orig %lu, donor %lu]\n",
+                                orig_start + *len, donor_inode->i_size,
+                                *len, donor_inode->i_size - orig_start,
+                                orig_inode->i_ino, donor_inode->i_ino);
+                        *len = donor_inode->i_size - orig_start;
+                }
+        } else {
+                if (orig_start >= orig_inode->i_size) {
+                        ext4_debug("ext4 move extent: start offset [%llu] "
+                                "should be less than original file size "
+                                "[%lld] [inode:orig %lu, donor %lu]\n",
+                                 orig_start, orig_inode->i_size,
+                                orig_inode->i_ino, donor_inode->i_ino);
+                        return -EINVAL;
+                }
+                if (orig_start + *len > orig_inode->i_size) {
+                        ext4_debug("ext4 move extent: Adjust length "
+                                "from %llu to %lld. Because it should be "
+                                "less than original file size "
+                                "[ino:orig %lu, donor %lu]\n",
+                                *len, orig_inode->i_size - orig_start,
+                                orig_inode->i_ino, donor_inode->i_ino);
+                        *len = orig_inode->i_size - orig_start;
+                }
+        }
+        if (!*len) {
+                ext4_debug("ext4 move extent: len shoudld not be 0 "
+                        "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
+                        donor_inode->i_ino);
+                return -EINVAL;
+        }
+        return 0;
+}
+/**
+ * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
+ *
+ * @inode1:     the inode structure
+ * @inode2:     the inode structure
+ *
+ * Lock two inodes' i_mutex in i_ino order (the smaller i_ino is locked
+ * first). This function is moved from fs/inode.c.
+ */
+static void
+mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
+{
+        if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
+                if (inode1)
+                        mutex_lock(&inode1->i_mutex);
+                else if (inode2)
+                        mutex_lock(&inode2->i_mutex);
+                return;
+        }
+        if (inode1->i_ino < inode2->i_ino) {
+                mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+                mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+        } else {
+                mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
+                mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
+        }
+}
+/**
+ * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
+ *
+ * @inode1:     the inode that is released first (skipped if NULL)
+ * @inode2:     the inode released second (skipped if NULL or == @inode1)
+ *
+ * This function is moved from fs/inode.c.
+ */
+static void
+mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
+{
+        if (inode1)
+                mutex_unlock(&inode1->i_mutex);
+        if (inode2 && inode2 != inode1)
+                mutex_unlock(&inode2->i_mutex);
+}
+/**
+ * ext4_move_extents - Exchange the specified range of a file
+ *
+ * @o_filp:             file structure of the original file
+ * @d_filp:             file structure of the donor file
+ * @orig_start:         start offset in block for orig
+ * @donor_start:        start offset in block for donor
+ * @len:                the number of blocks to be moved
+ * @moved_len:          moved block length
+ *
+ * This function returns 0 and sets the moved block length in moved_len
+ * on success, otherwise it returns an error value.
+ *
+ * Note: ext4_move_extents() proceeds in the following order.
+ * 1:ext4_move_extents() calculates the last block number of moving extent
+ *   function by the start block number (orig_start) and the number of blocks
+ *   to be moved (len) specified as arguments.
+ *   If the {orig, donor}_start points a hole, the extent's start offset
+ *   pointed by ext_cur (current extent), holecheck_path, orig_path are set
+ *   after hole behind.
+ * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent
+ *   or the ext_cur exceeds the block_end which is last logical block number.
+ * 3:To get the length of the contiguous area, call mext_next_extent()
+ *   specified with the ext_cur (initial value is holecheck_path) repeatedly,
+ *   until a non-contiguous extent is found, the start logical block number
+ *   exceeds the block_end or the extent points to the last extent.
+ * 4:Exchange the original inode data with donor inode data
+ *   from orig_page_offset to seq_end_page.
+ *   The start indexes of data are specified as arguments.
+ *   That of the original inode is orig_page_offset,
+ *   and the donor inode is also orig_page_offset
+ *   (To easily handle blocksize != pagesize case, the offset for the
+ *   donor inode is block unit).
+ * 5:Update holecheck_path and orig_path to point to the next extent to
+ *   process, then return to step 2.
+ * 6:Release holecheck_path, orig_path and set the len to moved_len
+ *   which shows the number of moved blocks.
+ *   The moved_len is useful for the command to calculate the file offset
+ *   for starting next move extent ioctl.
+ * 7:Return 0 on success, or a negative error value on failure.
+ */
+int
+ext4_move_extents(struct file *o_filp, struct file *d_filp,
+                 __u64 orig_start, __u64 donor_start, __u64 len,
+                 __u64 *moved_len)
+{
+        struct inode *orig_inode = o_filp->f_dentry->d_inode;
+        struct inode *donor_inode = d_filp->f_dentry->d_inode;
+        struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
+        struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
+        ext4_lblk_t block_start = orig_start;
+        ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
+        ext4_lblk_t rest_blocks;
+        pgoff_t orig_page_offset = 0, seq_end_page;
+        int ret, depth, last_extent = 0;
+        int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+        int data_offset_in_page;
+        int block_len_in_page;
+        int uninit;
+        /* protect orig and donor against a truncate */
+        mext_inode_double_lock(orig_inode, donor_inode);
+        mext_double_down_read(orig_inode, donor_inode);
+        /* Check the filesystem environment whether move_extent can be done */
+        ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
+                                        donor_start, &len, *moved_len);
+        mext_double_up_read(orig_inode, donor_inode);
+        if (ret)
+                goto out2;
+        file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
+        block_end = block_start + len - 1;
+        if (file_end < block_end)
+                len -= block_end - file_end;
+        get_ext_path(orig_path, orig_inode, block_start, ret);
+        if (orig_path == NULL)
+                goto out2;
+        /* Get path structure to check the hole */
+        get_ext_path(holecheck_path, orig_inode, block_start, ret);
+        if (holecheck_path == NULL)
+                goto out;
+        depth = ext_depth(orig_inode);
+        ext_cur = holecheck_path[depth].p_ext;
+        if (ext_cur == NULL) {
+                ret = -EINVAL;
+                goto out;
+        }
+        /*
+         * Get proper extent whose ee_block is beyond block_start
+         * if block_start was within the hole.
+         */
+        if (le32_to_cpu(ext_cur->ee_block) +
+                ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
+                last_extent = mext_next_extent(orig_inode,
+                                        holecheck_path, &ext_cur);
+                if (last_extent < 0) {
+                        ret = last_extent;
+                        goto out;
+                }
+                last_extent = mext_next_extent(orig_inode, orig_path,
+                                                        &ext_dummy);
+                if (last_extent < 0) {
+                        ret = last_extent;
+                        goto out;
+                }
+        }
+        seq_start = block_start;
+        /* No blocks within the specified range. */
+        if (le32_to_cpu(ext_cur->ee_block) > block_end) {
+                ext4_debug("ext4 move extent: The specified range of file "
+                                                        "may be the hole\n");
+                ret = -EINVAL;
+                goto out;
+        }
+        /* Adjust start blocks */
+        add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
+                         ext4_ext_get_actual_len(ext_cur), block_end + 1) -
+                     max(le32_to_cpu(ext_cur->ee_block), block_start);
+        while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
+                seq_blocks += add_blocks;
+                /* Adjust tail blocks */
+                if (seq_start + seq_blocks - 1 > block_end)
+                        seq_blocks = block_end - seq_start + 1;
+                ext_prev = ext_cur;
+                last_extent = mext_next_extent(orig_inode, holecheck_path,
+                                                &ext_cur);
+                if (last_extent < 0) {
+                        ret = last_extent;
+                        break;
+                }
+                add_blocks = ext4_ext_get_actual_len(ext_cur);
+                /*
+                 * Extend the length of contiguous block (seq_blocks)
+                 * if extents are contiguous.
+                 */
+                if (ext4_can_extents_be_merged(orig_inode,
+                                               ext_prev, ext_cur) &&
+                    block_end >= le32_to_cpu(ext_cur->ee_block) &&
+                    !last_extent)
+                        continue;
+                /* Is the original extent uninitialized? */
+                uninit = ext4_ext_is_uninitialized(ext_prev);
+                data_offset_in_page = seq_start % blocks_per_page;
+                /*
+                 * Calculate data blocks count that should be swapped
+                 * at the first page.
+                 */
+                if (data_offset_in_page + seq_blocks > blocks_per_page) {
+                        /* Swapped blocks are across pages */
+                        block_len_in_page =
+                                        blocks_per_page - data_offset_in_page;
+                } else {
+                        /* Swapped blocks are in a page */
+                        block_len_in_page = seq_blocks;
+                }
+                orig_page_offset = seq_start >>
+                                (PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
+                seq_end_page = (seq_start + seq_blocks - 1) >>
+                                (PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
+                seq_start = le32_to_cpu(ext_cur->ee_block);
+                rest_blocks = seq_blocks;
+                /* Discard preallocations of two inodes */
+                down_write(&EXT4_I(orig_inode)->i_data_sem);
+                ext4_discard_preallocations(orig_inode);
+                up_write(&EXT4_I(orig_inode)->i_data_sem);
+                down_write(&EXT4_I(donor_inode)->i_data_sem);
+                ext4_discard_preallocations(donor_inode);
+                up_write(&EXT4_I(donor_inode)->i_data_sem);
+                while (orig_page_offset <= seq_end_page) {
+                        /* Swap original branches with new branches */
+                        ret = move_extent_par_page(o_filp, donor_inode,
+                                                orig_page_offset,
+                                                data_offset_in_page,
+                                                block_len_in_page, uninit);
+                        if (ret < 0)
+                                goto out;
+                        orig_page_offset++;
+                        /* Count how many blocks we have exchanged */
+                        *moved_len += block_len_in_page;
+                        BUG_ON(*moved_len > len);
+                        data_offset_in_page = 0;
+                        rest_blocks -= block_len_in_page;
+                        if (rest_blocks > blocks_per_page)
+                                block_len_in_page = blocks_per_page;
+                        else
+                                block_len_in_page = rest_blocks;
+                }
+                /* Decrease buffer counter */
+                if (holecheck_path)
+                        ext4_ext_drop_refs(holecheck_path);
+                get_ext_path(holecheck_path, orig_inode,
+                                      seq_start, ret);
+                if (holecheck_path == NULL)
+                        break;
+                depth = holecheck_path->p_depth;
+                /* Decrease buffer counter */
+                if (orig_path)
+                        ext4_ext_drop_refs(orig_path);
+                get_ext_path(orig_path, orig_inode, seq_start, ret);
+                if (orig_path == NULL)
+                        break;
+                ext_cur = holecheck_path[depth].p_ext;
+                add_blocks = ext4_ext_get_actual_len(ext_cur);
+                seq_blocks = 0;
+        }
+out:
+        if (orig_path) {
+                ext4_ext_drop_refs(orig_path);
+                kfree(orig_path);
+        }
+        if (holecheck_path) {
+                ext4_ext_drop_refs(holecheck_path);
+                kfree(holecheck_path);
+        }
+out2:
+        mext_inode_double_unlock(orig_inode, donor_inode);
+        if (ret)
+                return ret;
+        /* On success, all of the specified blocks must have been exchanged */
+        BUG_ON(*moved_len != len);
+        return 0;
+}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 07eb6649e4fa..de04013d16ff 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1782,7 +1782,7 @@ retry:
        if (IS_DIRSYNC(dir))
                ext4_handle_sync(handle);
-        inode = ext4_new_inode (handle, dir, mode);
+        inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
        err = PTR_ERR(inode);
        if (!IS_ERR(inode)) {
                inode->i_op = &ext4_file_inode_operations;
@@ -1816,7 +1816,7 @@ retry:
        if (IS_DIRSYNC(dir))
                ext4_handle_sync(handle);
-        inode = ext4_new_inode(handle, dir, mode);
+        inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
        err = PTR_ERR(inode);
        if (!IS_ERR(inode)) {
                init_special_inode(inode, inode->i_mode, rdev);
@@ -1853,7 +1853,8 @@ retry:
        if (IS_DIRSYNC(dir))
                ext4_handle_sync(handle);
-        inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
+        inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
+                               &dentry->d_name, 0);
        err = PTR_ERR(inode);
        if (IS_ERR(inode))
                goto out_stop;
@@ -2264,7 +2265,8 @@ retry:
        if (IS_DIRSYNC(dir))
                ext4_handle_sync(handle);
-        inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
+        inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
+                               &dentry->d_name, 0);
        err = PTR_ERR(inode);
        if (IS_ERR(inode))
                goto out_stop;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 27eb289eea37..68b0351fc647 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1002,7 +1002,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
                        " too large to resize to %llu blocks safely\n",
                        sb->s_id, n_blocks_count);
                if (sizeof(sector_t) < 8)
-                        ext4_warning(sb, __func__, "CONFIG_LBD not enabled");
+                        ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled");
                return -EINVAL;
        }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 012c4251397e..8f4f079e6b9a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -37,7 +37,6 @@
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 #include <linux/ctype.h>
-#include <linux/marker.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
@@ -47,6 +46,9 @@
 #include "xattr.h"
 #include "acl.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/ext4.h>
 static int default_mb_history_length = 1000;
 module_param_named(default_mb_history_length, default_mb_history_length,
@@ -301,7 +303,7 @@ static void ext4_handle_error(struct super_block *sb)
        if (!test_opt(sb, ERRORS_CONT)) {
                journal_t *journal = EXT4_SB(sb)->s_journal;
-                EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
+                EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
                if (journal)
                        jbd2_journal_abort(journal, -EIO);
        }
@@ -414,7 +416,7 @@ void ext4_abort(struct super_block *sb, const char *function,
        ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
        EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
        sb->s_flags |= MS_RDONLY;
-        EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
+        EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
        if (EXT4_SB(sb)->s_journal)
                jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 }
@@ -664,10 +666,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        if (!ei)
                return NULL;
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
-        ei->i_acl = EXT4_ACL_NOT_CACHED;
-        ei->i_default_acl = EXT4_ACL_NOT_CACHED;
-#endif
        ei->vfs_inode.i_version = 1;
        ei->vfs_inode.i_data.writeback_index = 0;
        memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
@@ -733,18 +731,6 @@ static void destroy_inodecache(void)
 static void ext4_clear_inode(struct inode *inode)
 {
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
-        if (EXT4_I(inode)->i_acl &&
-                        EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
-                posix_acl_release(EXT4_I(inode)->i_acl);
-                EXT4_I(inode)->i_acl = EXT4_ACL_NOT_CACHED;
-        }
-        if (EXT4_I(inode)->i_default_acl &&
-                        EXT4_I(inode)->i_default_acl != EXT4_ACL_NOT_CACHED) {
-                posix_acl_release(EXT4_I(inode)->i_default_acl);
-                EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
-        }
-#endif
        ext4_discard_preallocations(inode);
        if (EXT4_JOURNAL(inode))
                jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
@@ -1474,7 +1460,7 @@ set_qf_format:
                        break;
 #endif
                case Opt_abort:
-                        set_opt(sbi->s_mount_opt, ABORT);
+                        sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
                        break;
                case Opt_nobarrier:
                        clear_opt(sbi->s_mount_opt, BARRIER);
@@ -1653,7 +1639,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
        ext4_commit_super(sb, 1);
        if (test_opt(sb, DEBUG))
                printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
-                                "bpg=%lu, ipg=%lu, mo=%04lx]\n",
+                                "bpg=%lu, ipg=%lu, mo=%04x]\n",
                        sb->s_blocksize,
                        sbi->s_groups_count,
                        EXT4_BLOCKS_PER_GROUP(sb),
@@ -1957,7 +1943,7 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
        /* small i_blocks in vfs inode? */
        if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
                /*
-                 * CONFIG_LBD is not enabled implies the inode
+                 * CONFIG_LBDAF is not enabled implies the inode
                 * i_block represent total blocks in 512 bytes
                 * 32 == size of vfs inode i_blocks * 8
                 */
@@ -2000,7 +1986,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
        if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
                /*
-                 * !has_huge_files or CONFIG_LBD not enabled implies that
+                 * !has_huge_files or CONFIG_LBDAF not enabled implies that
                 * the inode i_block field represents total file blocks in
                 * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
                 */
@@ -2204,6 +2190,7 @@ EXT4_RO_ATTR(session_write_kbytes);
 EXT4_RO_ATTR(lifetime_write_kbytes);
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
                 inode_readahead_blks_store, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
 EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
 EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
@@ -2216,6 +2203,7 @@ static struct attribute *ext4_attrs[] = {
        ATTR_LIST(session_write_kbytes),
        ATTR_LIST(lifetime_write_kbytes),
        ATTR_LIST(inode_readahead_blks),
+        ATTR_LIST(inode_goal),
        ATTR_LIST(mb_stats),
        ATTR_LIST(mb_max_to_scan),
        ATTR_LIST(mb_min_to_scan),
@@ -2436,13 +2424,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        if (has_huge_files) {
                /*
                 * Large file size enabled file system can only be
-                 * mount if kernel is build with CONFIG_LBD
+                 * mounted if the kernel is built with CONFIG_LBDAF
                 */
                if (sizeof(root->i_blocks) < sizeof(u64) &&
                                !(sb->s_flags & MS_RDONLY)) {
                        ext4_msg(sb, KERN_ERR, "Filesystem with huge "
                                        "files cannot be mounted read-write "
-                                        "without CONFIG_LBD");
+                                        "without CONFIG_LBDAF");
                        goto failed_mount;
                }
        }
@@ -2566,7 +2554,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                ext4_msg(sb, KERN_ERR, "filesystem"
                        " too large to mount safely");
                if (sizeof(sector_t) < 8)
-                        ext4_msg(sb, KERN_WARNING, "CONFIG_LBD not enabled");
+                        ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
                goto failed_mount;
        }
@@ -3346,7 +3334,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
        int ret = 0;
        tid_t target;
-        trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
+        trace_ext4_sync_fs(sb, wait);
        if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
                if (wait)
                        jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
@@ -3450,7 +3438,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                goto restore_opts;
        }
-        if (sbi->s_mount_opt & EXT4_MOUNT_ABORT)
+        if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
                ext4_abort(sb, __func__, "Abort forced by user");
        sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -3465,7 +3453,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
        if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
                n_blocks_count > ext4_blocks_count(es)) {
-                if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) {
+                if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
                        err = -EROFS;
                        goto restore_opts;
                }
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index b42602298087..923990e4f16e 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -241,7 +241,7 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
        while (*fclus < cluster) {
                /* prevent the infinite loop of cluster chain */
                if (*fclus > limit) {
-                        fat_fs_panic(sb, "%s: detected the cluster chain loop"
+                        fat_fs_error(sb, "%s: detected the cluster chain loop"
                                     " (i_pos %lld)", __func__,
                                     MSDOS_I(inode)->i_pos);
                        nr = -EIO;
@@ -252,7 +252,7 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
                if (nr < 0)
                        goto out;
                else if (nr == FAT_ENT_FREE) {
-                        fat_fs_panic(sb, "%s: invalid cluster chain"
+                        fat_fs_error(sb, "%s: invalid cluster chain"
                                     " (i_pos %lld)", __func__,
                                     MSDOS_I(inode)->i_pos);
                        nr = -EIO;
@@ -285,7 +285,7 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
        if (ret < 0)
                return ret;
        else if (ret == FAT_ENT_EOF) {
-                fat_fs_panic(sb, "%s: request beyond EOF (i_pos %lld)",
+                fat_fs_error(sb, "%s: request beyond EOF (i_pos %lld)",
                             __func__, MSDOS_I(inode)->i_pos);
                return -EIO;
        }
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index f3500294eec5..530b4ca01510 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -16,12 +16,24 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/time.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/compat.h>
 #include <asm/uaccess.h>
 #include "fat.h"
+/*
+ * Maximum buffer size of short name.
+ * [(MSDOS_NAME + '.') * max one char + nul]
+ * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
+ */
+#define FAT_MAX_SHORT_SIZE      ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
+/*
+ * Maximum buffer size of unicode chars from slots.
+ * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
+ */
+#define FAT_MAX_UNI_CHARS       ((MSDOS_SLOTS - 1) * 13 + 1)
+#define FAT_MAX_UNI_SIZE        (FAT_MAX_UNI_CHARS * sizeof(wchar_t))
 static inline loff_t fat_make_i_pos(struct super_block *sb,
                                    struct buffer_head *bh,
                                    struct msdos_dir_entry *de)
@@ -171,7 +183,8 @@ static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni,
                                unsigned char *buf, int size)
 {
        if (sbi->options.utf8)
-                return utf8_wcstombs(buf, uni, size);
+                return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS,
+                                UTF16_HOST_ENDIAN, buf, size);
        else
                return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate,
                                   sbi->nls_io);
@@ -325,19 +338,6 @@ parse_long:
 }
 /*
- * Maximum buffer size of short name.
- * [(MSDOS_NAME + '.') * max one char + nul]
- * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
- */
-#define FAT_MAX_SHORT_SIZE      ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
-/*
- * Maximum buffer size of unicode chars from slots.
- * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
- */
-#define FAT_MAX_UNI_CHARS       ((MSDOS_SLOTS - 1) * 13 + 1)
-#define FAT_MAX_UNI_SIZE        (FAT_MAX_UNI_CHARS * sizeof(wchar_t))
-/*
 * Return values: negative -> error, 0 -> not found, positive -> found,
 * value is the total amount of slots, including the shortname entry.
 */
@@ -1334,7 +1334,7 @@ found:
                        goto error_remove;
                }
                if (dir->i_size & (sbi->cluster_size - 1)) {
-                        fat_fs_panic(sb, "Odd directory size");
+                        fat_fs_error(sb, "Odd directory size");
                        dir->i_size = (dir->i_size + sbi->cluster_size - 1)
                                & ~((loff_t)sbi->cluster_size - 1);
                }
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e4d88527b5dd..adb0e72a176d 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -17,6 +17,10 @@
 #define VFAT_SFN_CREATE_WIN95   0x0100 /* emulate win95 rule for create */
 #define VFAT_SFN_CREATE_WINNT   0x0200 /* emulate winnt rule for create */
+#define FAT_ERRORS_CONT         1      /* ignore error and continue */
+#define FAT_ERRORS_PANIC        2      /* panic on error */
+#define FAT_ERRORS_RO           3      /* remount r/o on error */
 struct fat_mount_options {
        uid_t fs_uid;
        gid_t fs_gid;
@@ -26,6 +30,7 @@ struct fat_mount_options {
        char *iocharset;          /* Charset used for filename input/display */
        unsigned short shortname; /* flags for shortname display/create rule */
        unsigned char name_check; /* r = relaxed, n = normal, s = strict */
+        unsigned char errors;     /* On error: continue, panic, remount-ro */
        unsigned short allow_utime;/* permission for setting the [am]time */
        unsigned quiet:1,         /* set = fake successful chmods and chowns */
                 showexec:1,      /* set = only set x bit for com/exe/bat */
@@ -316,7 +321,7 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
 extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
                            struct inode *i2);
 /* fat/misc.c */
-extern void fat_fs_panic(struct super_block *s, const char *fmt, ...)
+extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
        __attribute__ ((format (printf, 2, 3))) __cold;
 extern void fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 618f5305c2e4..a81037721a6f 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -348,7 +348,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry)
        if (entry < FAT_START_ENT || sbi->max_cluster <= entry) {
                fatent_brelse(fatent);
-                fat_fs_panic(sb, "invalid access to FAT (entry 0x%08x)", entry);
+                fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry);
                return -EIO;
        }
@@ -560,7 +560,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
                        err = cluster;
                        goto error;
                } else if (cluster == FAT_ENT_FREE) {
-                        fat_fs_panic(sb, "%s: deleting FAT entry beyond EOF",
+                        fat_fs_error(sb, "%s: deleting FAT entry beyond EOF",
                                     __func__);
                        err = -EIO;
                        goto error;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e955a56b4e5e..f042b965c95c 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -18,106 +18,112 @@
 #include <linux/security.h>
 #include "fat.h"
-int fat_generic_ioctl(struct inode *inode, struct file *filp,
+static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
-                      unsigned int cmd, unsigned long arg)
 {
+        u32 attr;
+        mutex_lock(&inode->i_mutex);
+        attr = fat_make_attrs(inode);
+        mutex_unlock(&inode->i_mutex);
+        return put_user(attr, user_attr);
+}
+static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
+{
+        struct inode *inode = file->f_path.dentry->d_inode;
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
-        u32 __user *user_attr = (u32 __user *)arg;
+        int is_dir = S_ISDIR(inode->i_mode);
+        u32 attr, oldattr;
+        struct iattr ia;
+        int err;
-        switch (cmd) {
+        err = get_user(attr, user_attr);
-        case FAT_IOCTL_GET_ATTRIBUTES:
+        if (err)
-        {
+                goto out;
-                u32 attr;
-                mutex_lock(&inode->i_mutex);
+        mutex_lock(&inode->i_mutex);
-                attr = fat_make_attrs(inode);
+        err = mnt_want_write(file->f_path.mnt);
-                mutex_unlock(&inode->i_mutex);
+        if (err)
+                goto out_unlock_inode;
-                return put_user(attr, user_attr);
+        /*
+         * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
+         * prevents the user from turning us into a VFAT
+         * longname entry.  Also, we obviously can't set
+         * any of the NTFS attributes in the high 24 bits.
+         */
+        attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR);
+        /* Merge in ATTR_VOLUME and ATTR_DIR */
+        attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
+                (is_dir ? ATTR_DIR : 0);
+        oldattr = fat_make_attrs(inode);
+        /* Equivalent to a chmod() */
+        ia.ia_valid = ATTR_MODE | ATTR_CTIME;
+        ia.ia_ctime = current_fs_time(inode->i_sb);
+        if (is_dir)
+                ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
+        else {
+                ia.ia_mode = fat_make_mode(sbi, attr,
+                        S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
        }
-        case FAT_IOCTL_SET_ATTRIBUTES:
-        {
-                u32 attr, oldattr;
-                int err, is_dir = S_ISDIR(inode->i_mode);
-                struct iattr ia;
-                err = get_user(attr, user_attr);
+        /* The root directory has no attributes */
-                if (err)
+        if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) {
-                        return err;
+                err = -EINVAL;
+                goto out_drop_write;
+        }
-                mutex_lock(&inode->i_mutex);
+        if (sbi->options.sys_immutable &&
+            ((attr | oldattr) & ATTR_SYS) &&
-                err = mnt_want_write(filp->f_path.mnt);
+            !capable(CAP_LINUX_IMMUTABLE)) {
-                if (err)
+                err = -EPERM;
-                        goto up_no_drop_write;
+                goto out_drop_write;
+        }
-                /*
-                 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
-                 * prevents the user from turning us into a VFAT
-                 * longname entry.  Also, we obviously can't set
-                 * any of the NTFS attributes in the high 24 bits.
-                 */
-                attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR);
-                /* Merge in ATTR_VOLUME and ATTR_DIR */
-                attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
-                        (is_dir ? ATTR_DIR : 0);
-                oldattr = fat_make_attrs(inode);
-                /* Equivalent to a chmod() */
-                ia.ia_valid = ATTR_MODE | ATTR_CTIME;
-                ia.ia_ctime = current_fs_time(inode->i_sb);
-                if (is_dir)
-                        ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
-                else {
-                        ia.ia_mode = fat_make_mode(sbi, attr,
-                                S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
-                }
-                /* The root directory has no attributes */
+        /*
-                if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) {
+         * The security check is questionable...  We single
-                        err = -EINVAL;
+         * out the RO attribute for checking by the security
-                        goto up;
+         * module, just because it maps to a file mode.
-                }
+         */
+        err = security_inode_setattr(file->f_path.dentry, &ia);
+        if (err)
+                goto out_drop_write;
-                if (sbi->options.sys_immutable) {
+        /* This MUST be done before doing anything irreversible... */
-                        if ((attr | oldattr) & ATTR_SYS) {
+        err = fat_setattr(file->f_path.dentry, &ia);
-                                if (!capable(CAP_LINUX_IMMUTABLE)) {
+        if (err)
-                                        err = -EPERM;
+                goto out_drop_write;
-                                        goto up;
-                                }
+        fsnotify_change(file->f_path.dentry, ia.ia_valid);
-                        }
+        if (sbi->options.sys_immutable) {
-                }
+                if (attr & ATTR_SYS)
+                        inode->i_flags |= S_IMMUTABLE;
+                else
+                        inode->i_flags &= ~S_IMMUTABLE;
+        }
-                /*
+        fat_save_attrs(inode, attr);
-                 * The security check is questionable...  We single
+        mark_inode_dirty(inode);
-                 * out the RO attribute for checking by the security
+out_drop_write:
-                 * module, just because it maps to a file mode.
+        mnt_drop_write(file->f_path.mnt);
-                 */
+out_unlock_inode:
-                err = security_inode_setattr(filp->f_path.dentry, &ia);
+        mutex_unlock(&inode->i_mutex);
-                if (err)
+out:
-                        goto up;
+        return err;
+}
-                /* This MUST be done before doing anything irreversible... */
-                err = fat_setattr(filp->f_path.dentry, &ia);
-                if (err)
-                        goto up;
-                fsnotify_change(filp->f_path.dentry, ia.ia_valid);
-                if (sbi->options.sys_immutable) {
-                        if (attr & ATTR_SYS)
-                                inode->i_flags |= S_IMMUTABLE;
-                        else
-                                inode->i_flags &= S_IMMUTABLE;
-                }
-                fat_save_attrs(inode, attr);
+int fat_generic_ioctl(struct inode *inode, struct file *filp,
-                mark_inode_dirty(inode);
+                      unsigned int cmd, unsigned long arg)
-up:
+{
-                mnt_drop_write(filp->f_path.mnt);
+        u32 __user *user_attr = (u32 __user *)arg;
-up_no_drop_write:
-                mutex_unlock(&inode->i_mutex);
+        switch (cmd) {
-                return err;
+        case FAT_IOCTL_GET_ATTRIBUTES:
-        }
+                return fat_ioctl_get_attributes(inode, user_attr);
+        case FAT_IOCTL_SET_ATTRIBUTES:
+                return fat_ioctl_set_attributes(filp, user_attr);
        default:
                return -ENOTTY; /* Inappropriate ioctl for device */
        }
@@ -128,7 +134,7 @@ static int fat_file_release(struct inode *inode, struct file *filp)
        if ((filp->f_mode & FMODE_WRITE) &&
             MSDOS_SB(inode->i_sb)->options.flush) {
                fat_flush_inodes(inode->i_sb, inode, NULL);
-                congestion_wait(WRITE, HZ/10);
+                congestion_wait(BLK_RW_ASYNC, HZ/10);
        }
        return 0;
 }
@@ -225,7 +231,7 @@ static int fat_free(struct inode *inode, int skip)
                        fatent_brelse(&fatent);
                        return 0;
                } else if (ret == FAT_ENT_FREE) {
-                        fat_fs_panic(sb,
+                        fat_fs_error(sb,
                                     "%s: invalid cluster chain (i_pos %lld)",
                                     __func__, MSDOS_I(inode)->i_pos);
                        ret = -EIO;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 51a5ecf9000a..8970d8c49bb0 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -76,7 +76,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
                return 0;
        if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) {
-                fat_fs_panic(sb, "corrupted file size (i_pos %lld, %lld)",
+                fat_fs_error(sb, "corrupted file size (i_pos %lld, %lld)",
                        MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private);
                return -EIO;
        }
@@ -856,6 +856,12 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
                seq_puts(m, ",flush");
        if (opts->tz_utc)
                seq_puts(m, ",tz=UTC");
+        if (opts->errors == FAT_ERRORS_CONT)
+                seq_puts(m, ",errors=continue");
+        else if (opts->errors == FAT_ERRORS_PANIC)
+                seq_puts(m, ",errors=panic");
+        else
+                seq_puts(m, ",errors=remount-ro");
        return 0;
 }
@@ -868,7 +874,8 @@ enum {
        Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
        Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
        Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
-        Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err,
+        Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
+        Opt_err_panic, Opt_err_ro, Opt_err,
 };
 static const match_table_t fat_tokens = {
@@ -891,6 +898,11 @@ static const match_table_t fat_tokens = {
        {Opt_showexec, "showexec"},
        {Opt_debug, "debug"},
        {Opt_immutable, "sys_immutable"},
+        {Opt_flush, "flush"},
+        {Opt_tz_utc, "tz=UTC"},
+        {Opt_err_cont, "errors=continue"},
+        {Opt_err_panic, "errors=panic"},
+        {Opt_err_ro, "errors=remount-ro"},
        {Opt_obsolate, "conv=binary"},
        {Opt_obsolate, "conv=text"},
        {Opt_obsolate, "conv=auto"},
@@ -902,8 +914,6 @@ static const match_table_t fat_tokens = {
        {Opt_obsolate, "cvf_format=%20s"},
        {Opt_obsolate, "cvf_options=%100s"},
        {Opt_obsolate, "posix"},
-        {Opt_flush, "flush"},
-        {Opt_tz_utc, "tz=UTC"},
        {Opt_err, NULL},
 };
 static const match_table_t msdos_tokens = {
@@ -956,7 +966,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
        opts->fs_uid = current_uid();
        opts->fs_gid = current_gid();
-        opts->fs_fmask = current_umask();
+        opts->fs_fmask = opts->fs_dmask = current_umask();
        opts->allow_utime = -1;
        opts->codepage = fat_default_codepage;
        opts->iocharset = fat_default_iocharset;
@@ -973,6 +983,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
        opts->numtail = 1;
        opts->usefree = opts->nocase = 0;
        opts->tz_utc = 0;
+        opts->errors = FAT_ERRORS_RO;
        *debug = 0;
        if (!options)
@@ -1065,6 +1076,15 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
                case Opt_tz_utc:
                        opts->tz_utc = 1;
                        break;
+                case Opt_err_cont:
+                        opts->errors = FAT_ERRORS_CONT;
+                        break;
+                case Opt_err_panic:
+                        opts->errors = FAT_ERRORS_PANIC;
+                        break;
+                case Opt_err_ro:
+                        opts->errors = FAT_ERRORS_RO;
+                        break;
                /* msdos specific */
                case Opt_dots:
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index ac39ebcc1496..a6c20473dfd7 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -12,14 +12,19 @@
 #include "fat.h"
 /*
- * fat_fs_panic reports a severe file system problem and sets the file system
+ * fat_fs_error reports a file system problem that might indicate a data
- * read-only. The file system can be made writable again by remounting it.
+ * corruption/inconsistency. Depending on 'errors' mount option the
+ * panic() is called, or an error message is printed and nothing is done,
+ * or filesystem is remounted read-only (default behavior).
+ * In case the file system is remounted read-only, it can be made writable
+ * again by remounting it.
 */
-void fat_fs_panic(struct super_block *s, const char *fmt, ...)
+void fat_fs_error(struct super_block *s, const char *fmt, ...)
 {
+        struct fat_mount_options *opts = &MSDOS_SB(s)->options;
        va_list args;
-        printk(KERN_ERR "FAT: Filesystem panic (dev %s)\n", s->s_id);
+        printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
        printk(KERN_ERR "    ");
        va_start(args, fmt);
@@ -27,13 +32,14 @@ void fat_fs_panic(struct super_block *s, const char *fmt, ...)
        va_end(args);
        printk("\n");
-        if (!(s->s_flags & MS_RDONLY)) {
+        if (opts->errors == FAT_ERRORS_PANIC)
+                panic("    FAT fs panic from previous error\n");
+        else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
                s->s_flags |= MS_RDONLY;
                printk(KERN_ERR "    File system has been set read-only\n");
        }
 }
+EXPORT_SYMBOL_GPL(fat_fs_error);
-EXPORT_SYMBOL_GPL(fat_fs_panic);
 /* Flushes the number of free clusters on FAT32 */
 /* XXX: Need to write one per FSINFO block.  Currently only writes 1 */
@@ -124,7 +130,7 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
                        mark_inode_dirty(inode);
        }
        if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) {
-                fat_fs_panic(sb, "clusters badly computed (%d != %llu)",
+                fat_fs_error(sb, "clusters badly computed (%d != %llu)",
                             new_fclus,
                             (llu)(inode->i_blocks >> (sbi->cluster_bits - 9)));
                fat_cache_inval_inode(inode);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 20f522861355..bbc94ae4fd77 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -9,7 +9,6 @@
 #include <linux/module.h>
 #include <linux/time.h>
 #include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
 #include "fat.h"
 /* Characters that are undesirable in an MS-DOS file name */
@@ -608,7 +607,7 @@ error_inode:
                sinfo.bh = NULL;
        }
        if (corrupt < 0) {
-                fat_fs_panic(new_dir->i_sb,
+                fat_fs_error(new_dir->i_sb,
                             "%s: Filesystem corrupted (i_pos %lld)",
                             __func__, sinfo.i_pos);
        }
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index b50ecbe97f83..cb6e83557112 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -19,7 +19,6 @@
 #include <linux/jiffies.h>
 #include <linux/ctype.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/namei.h>
 #include "fat.h"
@@ -502,11 +501,11 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
        if (utf8) {
                int name_len = strlen(name);
-                *outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX);
+                *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname);
                /*
                 * We stripped '.'s before and set len appropriately,
-                 * but utf8_mbstowcs doesn't care about len
+                 * but utf8s_to_utf16s doesn't care about len
                 */
                *outlen -= (name_len - len);
@@ -1030,7 +1029,7 @@ error_inode:
                sinfo.bh = NULL;
        }
        if (corrupt < 0) {
-                fat_fs_panic(new_dir->i_sb,
+                fat_fs_error(new_dir->i_sb,
                             "%s: Filesystem corrupted (i_pos %lld)",
                             __func__, sinfo.i_pos);
        }
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 1ad703150dee..ae413086db97 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -19,7 +19,6 @@
 #include <linux/signal.h>
 #include <linux/rcupdate.h>
 #include <linux/pid_namespace.h>
-#include <linux/smp_lock.h>
 #include <asm/poll.h>
 #include <asm/siginfo.h>
@@ -198,15 +197,19 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 }
 static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
-                     uid_t uid, uid_t euid, int force)
+                     int force)
 {
        write_lock_irq(&filp->f_owner.lock);
        if (force || !filp->f_owner.pid) {
                put_pid(filp->f_owner.pid);
                filp->f_owner.pid = get_pid(pid);
                filp->f_owner.pid_type = type;
-                filp->f_owner.uid = uid;
-                filp->f_owner.euid = euid;
+                if (pid) {
+                        const struct cred *cred = current_cred();
+                        filp->f_owner.uid = cred->uid;
+                        filp->f_owner.euid = cred->euid;
+                }
        }
        write_unlock_irq(&filp->f_owner.lock);
 }
@@ -214,14 +217,13 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
 int __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
                int force)
 {
-        const struct cred *cred = current_cred();
        int err;
-        
        err = security_file_set_fowner(filp);
        if (err)
                return err;
-        f_modown(filp, pid, type, cred->uid, cred->euid, force);
+        f_modown(filp, pid, type, force);
        return 0;
 }
 EXPORT_SYMBOL(__f_setown);
@@ -247,7 +249,7 @@ EXPORT_SYMBOL(f_setown);
 void f_delown(struct file *filp)
 {
-        f_modown(filp, NULL, PIDTYPE_PID, 0, 0, 1);
+        f_modown(filp, NULL, PIDTYPE_PID, 1);
 }
 pid_t f_getown(struct file *filp)
@@ -425,14 +427,20 @@ static inline int sigio_perm(struct task_struct *p,
 }
 static void send_sigio_to_task(struct task_struct *p,
-                               struct fown_struct *fown, 
+                               struct fown_struct *fown,
                               int fd,
                               int reason)
 {
-        if (!sigio_perm(p, fown, fown->signum))
+        /*
+         * F_SETSIG can change ->signum lockless in parallel, make
+         * sure we read it once and use the same value throughout.
+         */
+        int signum = ACCESS_ONCE(fown->signum);
+        if (!sigio_perm(p, fown, signum))
                return;
-        switch (fown->signum) {
+        switch (signum) {
                siginfo_t si;
                default:
                        /* Queue a rt signal with the appropriate fd as its
@@ -441,7 +449,7 @@ static void send_sigio_to_task(struct task_struct *p,
                           delivered even if we can't queue.  Failure to
                           queue in this case _should_ be reported; we fall
                           back to SIGIO in that case. --sct */
-                        si.si_signo = fown->signum;
+                        si.si_signo = signum;
                        si.si_errno = 0;
                        si.si_code  = reason;
                        /* Make sure we are called with one of the POLL_*
@@ -453,7 +461,7 @@ static void send_sigio_to_task(struct task_struct *p,
                        else
                                si.si_band = band_table[reason - POLL_IN];
                        si.si_fd    = fd;
-                        if (!group_send_sig_info(fown->signum, &si, p))
+                        if (!group_send_sig_info(signum, &si, p))
                                break;
                /* fall-through: fall back on the old plain SIGIO signal */
                case 0:
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index cdbd1654e4cd..1e8af939b3e4 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -38,6 +38,7 @@
 #include <linux/buffer_head.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <linux/stat.h>
 #include <linux/vfs.h>
 #include <linux/mount.h>
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 40308e98c6a4..c54226be5294 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -278,7 +278,26 @@ int sb_has_dirty_inodes(struct super_block *sb)
 EXPORT_SYMBOL(sb_has_dirty_inodes);
 /*
- * Write a single inode's dirty pages and inode data out to disk.
+ * Wait for writeback on an inode to complete.
+ */
+static void inode_wait_for_writeback(struct inode *inode)
+{
+        /*
+         * Sleep until I_SYNC is clear.  inode_lock is dropped for each
+         * sleep and re-taken before i_state is looked at again, so the
+         * function both starts and ends with inode_lock held.
+         */
+        wait_queue_head_t *head = bit_waitqueue(&inode->i_state, __I_SYNC);
+        DEFINE_WAIT_BIT(wait, &inode->i_state, __I_SYNC);
+        for (;;) {
+                spin_unlock(&inode_lock);
+                __wait_on_bit(head, &wait, inode_wait, TASK_UNINTERRUPTIBLE);
+                spin_lock(&inode_lock);
+                /* Someone may have set I_SYNC again while we slept. */
+                if (!(inode->i_state & I_SYNC))
+                        break;
+        }
+}
+/*
+ * Write out an inode's dirty pages.  Called under inode_lock.  Either the
+ * caller has ref on the inode (either via __iget or via syscall against an fd)
+ * or the inode has I_WILL_FREE set (via generic_forget_inode)
+ *
 * If `wait' is set, wait on the writeout.
 *
 * The whole writeout design is quite complex and fragile.  We want to avoid
@@ -288,13 +307,38 @@ EXPORT_SYMBOL(sb_has_dirty_inodes);
 * Called under inode_lock.
 */
 static int
-__sync_single_inode(struct inode *inode, struct writeback_control *wbc)
+writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 {
-        unsigned dirty;
        struct address_space *mapping = inode->i_mapping;
        int wait = wbc->sync_mode == WB_SYNC_ALL;
+        unsigned dirty;
        int ret;
+        if (!atomic_read(&inode->i_count))
+                WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
+        else
+                WARN_ON(inode->i_state & I_WILL_FREE);
+        if (inode->i_state & I_SYNC) {
+                /*
+                 * If this inode is locked for writeback and we are not doing
+                 * writeback-for-data-integrity, move it to s_more_io so that
+                 * writeback can proceed with the other inodes on s_io.
+                 *
+                 * We'll have another go at writing back this inode when we
+                 * completed a full scan of s_io.
+                 */
+                if (!wait) {
+                        requeue_io(inode);
+                        return 0;
+                }
+                /*
+                 * It's a data-integrity sync.  We must wait.
+                 */
+                inode_wait_for_writeback(inode);
+        }
        BUG_ON(inode->i_state & I_SYNC);
        /* Set I_SYNC, reset I_DIRTY */
@@ -321,7 +365,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
        spin_lock(&inode_lock);
        inode->i_state &= ~I_SYNC;
-        if (!(inode->i_state & I_FREEING)) {
+        if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
                if (!(inode->i_state & I_DIRTY) &&
                    mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
                        /*
@@ -390,50 +434,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 }
 /*
- * Write out an inode's dirty pages.  Called under inode_lock.  Either the
- * caller has ref on the inode (either via __iget or via syscall against an fd)
- * or the inode has I_WILL_FREE set (via generic_forget_inode)
- */
-static int
-__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
-{
-        wait_queue_head_t *wqh;
-        if (!atomic_read(&inode->i_count))
-                WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
-        else
-                WARN_ON(inode->i_state & I_WILL_FREE);
-        if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_SYNC)) {
-                /*
-                 * We're skipping this inode because it's locked, and we're not
-                 * doing writeback-for-data-integrity.  Move it to s_more_io so
-                 * that writeback can proceed with the other inodes on s_io.
-                 * We'll have another go at writing back this inode when we
-                 * completed a full scan of s_io.
-                 */
-                requeue_io(inode);
-                return 0;
-        }
-        /*
-         * It's a data-integrity sync.  We must wait.
-         */
-        if (inode->i_state & I_SYNC) {
-                DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
-                wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
-                do {
-                        spin_unlock(&inode_lock);
-                        __wait_on_bit(wqh, &wq, inode_wait,
-                                                        TASK_UNINTERRUPTIBLE);
-                        spin_lock(&inode_lock);
-                } while (inode->i_state & I_SYNC);
-        }
-        return __sync_single_inode(inode, wbc);
-}
-/*
 * Write out a superblock's list of dirty inodes.  A wait will be performed
 * upon no inodes, all inodes or the final one, depending upon sync_mode.
 *
@@ -492,7 +492,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
                        break;
                }
-                if (inode->i_state & I_NEW) {
+                if (inode->i_state & (I_NEW | I_WILL_FREE)) {
                        requeue_io(inode);
                        continue;
                }
@@ -523,10 +523,10 @@ void generic_sync_sb_inodes(struct super_block *sb,
                if (current_is_pdflush() && !writeback_acquire(bdi))
                        break;
-                BUG_ON(inode->i_state & I_FREEING);
+                BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
                __iget(inode);
                pages_skipped = wbc->pages_skipped;
-                __writeback_single_inode(inode, wbc);
+                writeback_single_inode(inode, wbc);
                if (current_is_pdflush())
                        writeback_release(bdi);
                if (wbc->pages_skipped != pages_skipped) {
@@ -708,7 +708,7 @@ int write_inode_now(struct inode *inode, int sync)
        might_sleep();
        spin_lock(&inode_lock);
-        ret = __writeback_single_inode(inode, &wbc);
+        ret = writeback_single_inode(inode, &wbc);
        spin_unlock(&inode_lock);
        if (sync)
                inode_sync_wait(inode);
@@ -732,7 +732,7 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
        int ret;
        spin_lock(&inode_lock);
-        ret = __writeback_single_inode(inode, wbc);
+        ret = writeback_single_inode(inode, wbc);
        spin_unlock(&inode_lock);
        return ret;
 }
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8fed2ed12f38..6484eb75acd6 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -286,8 +286,8 @@ __releases(&fc->lock)
                }
                if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
                    fc->connected && fc->bdi_initialized) {
-                        clear_bdi_congested(&fc->bdi, READ);
+                        clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
-                        clear_bdi_congested(&fc->bdi, WRITE);
+                        clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
                }
                fc->num_background--;
                fc->active_background--;
@@ -414,8 +414,8 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
                fc->blocked = 1;
        if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
            fc->bdi_initialized) {
-                set_bdi_congested(&fc->bdi, READ);
+                set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
-                set_bdi_congested(&fc->bdi, WRITE);
+                set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
        }
        list_add_tail(&req->list, &fc->bg_queue);
        flush_bg_queue(fc);
@@ -849,6 +849,81 @@ err:
        return err;
 }
+/*
+ * Handle a FUSE_NOTIFY_INVAL_INODE message: read the fixed-size argument
+ * and, if the connection still has a super block, pass the request on to
+ * fuse_reverse_inval_inode().
+ *
+ * Returns -EINVAL when @size is not exactly the argument size, the error
+ * from fuse_copy_one(), -ENOENT when fc->sb is NULL, or otherwise the
+ * result of fuse_reverse_inval_inode().
+ */
+static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
+                                   struct fuse_copy_state *cs)
+{
+        struct fuse_notify_inval_inode_out msg;
+        int ret = -EINVAL;
+        if (size == sizeof(msg))
+                ret = fuse_copy_one(cs, &msg, sizeof(msg));
+        /* The copy state is finished exactly once, on success and failure. */
+        fuse_copy_finish(cs);
+        if (ret)
+                return ret;
+        /* fc->sb is only looked at with killsb held for reading. */
+        down_read(&fc->killsb);
+        if (fc->sb)
+                ret = fuse_reverse_inval_inode(fc->sb, msg.ino,
+                                               msg.off, msg.len);
+        else
+                ret = -ENOENT;
+        up_read(&fc->killsb);
+        return ret;
+}
+/*
+ * Handle a FUSE_NOTIFY_INVAL_ENTRY message sent by the filesystem.
+ *
+ * The payload is a struct fuse_notify_inval_entry_out followed by
+ * outarg.namelen + 1 bytes of name (the final byte is overwritten with
+ * NUL here), so @size must be exactly sizeof(outarg) + outarg.namelen + 1.
+ *
+ * Returns 0 on success or a negative errno: -EINVAL when @size does not
+ * match that layout, -ENAMETOOLONG when the name exceeds FUSE_NAME_MAX,
+ * -ENOENT when the connection no longer has a super block, or whatever
+ * fuse_copy_one() / fuse_reverse_inval_entry() report.
+ */
+static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
+                                   struct fuse_copy_state *cs)
+{
+        struct fuse_notify_inval_entry_out outarg;
+        int err = -EINVAL;
+        char buf[FUSE_NAME_MAX+1];
+        struct qstr name;
+        if (size < sizeof(outarg))
+                goto err;
+        err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+        if (err)
+                goto err;
+        /* Bound namelen first so the name always fits in buf. */
+        err = -ENAMETOOLONG;
+        if (outarg.namelen > FUSE_NAME_MAX)
+                goto err;
+        /*
+         * The declared message size must cover the header and the name
+         * exactly; without this check namelen + 1 bytes would be copied
+         * regardless of how much data the message claims to carry.
+         */
+        err = -EINVAL;
+        if (size != sizeof(outarg) + outarg.namelen + 1)
+                goto err;
+        name.name = buf;
+        name.len = outarg.namelen;
+        err = fuse_copy_one(cs, buf, outarg.namelen + 1);
+        if (err)
+                goto err;
+        fuse_copy_finish(cs);
+        /* Force termination rather than trusting the sender's last byte. */
+        buf[outarg.namelen] = 0;
+        name.hash = full_name_hash(name.name, name.len);
+        /* fc->sb is only looked at with killsb held for reading. */
+        down_read(&fc->killsb);
+        err = -ENOENT;
+        if (!fc->sb)
+                goto err_unlock;
+        err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
+err_unlock:
+        up_read(&fc->killsb);
+        return err;
+err:
+        fuse_copy_finish(cs);
+        return err;
+}
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
                       unsigned int size, struct fuse_copy_state *cs)
 {
@@ -856,6 +931,12 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
        case FUSE_NOTIFY_POLL:
                return fuse_notify_poll(fc, size, cs);
+        case FUSE_NOTIFY_INVAL_INODE:
+                return fuse_notify_inval_inode(fc, size, cs);
+        case FUSE_NOTIFY_INVAL_ENTRY:
+                return fuse_notify_inval_entry(fc, size, cs);
        default:
                fuse_copy_finish(cs);
                return -EINVAL;
@@ -910,7 +991,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
                               unsigned long nr_segs, loff_t pos)
 {
        int err;
-        unsigned nbytes = iov_length(iov, nr_segs);
+        size_t nbytes = iov_length(iov, nr_segs);
        struct fuse_req *req;
        struct fuse_out_header oh;
        struct fuse_copy_state cs;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b3089a083d30..e703654e7f40 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -375,7 +375,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
        struct fuse_conn *fc = get_fuse_conn(dir);
        struct fuse_req *req;
        struct fuse_req *forget_req;
-        struct fuse_open_in inarg;
+        struct fuse_create_in inarg;
        struct fuse_open_out outopen;
        struct fuse_entry_out outentry;
        struct fuse_file *ff;
@@ -399,15 +399,20 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
        if (!ff)
                goto out_put_request;
+        if (!fc->dont_mask)
+                mode &= ~current_umask();
        flags &= ~O_NOCTTY;
        memset(&inarg, 0, sizeof(inarg));
        memset(&outentry, 0, sizeof(outentry));
        inarg.flags = flags;
        inarg.mode = mode;
+        inarg.umask = current_umask();
        req->in.h.opcode = FUSE_CREATE;
        req->in.h.nodeid = get_node_id(dir);
        req->in.numargs = 2;
-        req->in.args[0].size = sizeof(inarg);
+        req->in.args[0].size = fc->minor < 12 ? sizeof(struct fuse_open_in) :
+                                                sizeof(inarg);
        req->in.args[0].value = &inarg;
        req->in.args[1].size = entry->d_name.len + 1;
        req->in.args[1].value = entry->d_name.name;
@@ -546,12 +551,17 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
        if (IS_ERR(req))
                return PTR_ERR(req);
+        if (!fc->dont_mask)
+                mode &= ~current_umask();
        memset(&inarg, 0, sizeof(inarg));
        inarg.mode = mode;
        inarg.rdev = new_encode_dev(rdev);
+        inarg.umask = current_umask();
        req->in.h.opcode = FUSE_MKNOD;
        req->in.numargs = 2;
-        req->in.args[0].size = sizeof(inarg);
+        req->in.args[0].size = fc->minor < 12 ? FUSE_COMPAT_MKNOD_IN_SIZE :
+                                                sizeof(inarg);
        req->in.args[0].value = &inarg;
        req->in.args[1].size = entry->d_name.len + 1;
        req->in.args[1].value = entry->d_name.name;
@@ -578,8 +588,12 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode)
        if (IS_ERR(req))
                return PTR_ERR(req);
+        if (!fc->dont_mask)
+                mode &= ~current_umask();
        memset(&inarg, 0, sizeof(inarg));
        inarg.mode = mode;
+        inarg.umask = current_umask();
        req->in.h.opcode = FUSE_MKDIR;
        req->in.numargs = 2;
        req->in.args[0].size = sizeof(inarg);
@@ -845,6 +859,43 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
        return err;
 }
+/*
+ * Invalidate the cached attributes of the directory identified by
+ * @parent_nodeid and the cached dentry called @name inside it.
+ *
+ * Returns 0 when the entry was found and invalidated, -ENOTDIR when the
+ * parent inode is not a directory, and -ENOENT when the parent inode, an
+ * alias dentry for it, or the named child cannot be found.
+ */
+int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+                             struct qstr *name)
+{
+        struct inode *dir_inode;
+        struct dentry *dir_dentry;
+        struct dentry *child;
+        int ret;
+        dir_inode = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid);
+        if (!dir_inode)
+                return -ENOENT;
+        mutex_lock(&dir_inode->i_mutex);
+        if (!S_ISDIR(dir_inode->i_mode)) {
+                ret = -ENOTDIR;
+                goto out;
+        }
+        /* Every lookup failure below is reported as -ENOENT. */
+        ret = -ENOENT;
+        dir_dentry = d_find_alias(dir_inode);
+        if (dir_dentry) {
+                child = d_lookup(dir_dentry, name);
+                /* The alias is only needed for the lookup itself. */
+                dput(dir_dentry);
+                if (child) {
+                        fuse_invalidate_attr(dir_inode);
+                        fuse_invalidate_entry(child);
+                        dput(child);
+                        ret = 0;
+                }
+        }
+out:
+        mutex_unlock(&dir_inode->i_mutex);
+        iput(dir_inode);
+        return ret;
+}
 /*
 * Calling into a user-controlled filesystem gives the filesystem
 * daemon ptrace-like capabilities over the requester process.  This
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index fce6ce694fde..cbc464043b6f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1922,7 +1922,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait)
        req = fuse_get_req(fc);
        if (IS_ERR(req))
-                return PTR_ERR(req);
+                return POLLERR;
        req->in.h.opcode = FUSE_POLL;
        req->in.h.nodeid = ff->nodeid;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index aaf2f9ff970e..52b641fc0faf 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -446,6 +446,9 @@ struct fuse_conn {
        /** Do multi-page cached writes */
        unsigned big_writes:1;
+        /** Don't apply umask to creation modes */
+        unsigned dont_mask:1;
        /** The number of requests waiting for completion */
        atomic_t num_waiting;
@@ -481,6 +484,12 @@ struct fuse_conn {
        /** Called on final put */
        void (*release)(struct fuse_conn *);
+        /** Super block for this connection. */
+        struct super_block *sb;
+        /** Read/write semaphore to hold when accessing sb. */
+        struct rw_semaphore killsb;
 };
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -509,6 +518,11 @@ extern const struct file_operations fuse_dev_operations;
 extern const struct dentry_operations fuse_dentry_operations;
 /**
+ * Inode to nodeid comparison.
+ */
+int fuse_inode_eq(struct inode *inode, void *_nodeidp);
+/**
 * Get a filled in inode
 */
 struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
@@ -708,6 +722,19 @@ void fuse_release_nowrite(struct inode *inode);
 u64 fuse_get_attr_version(struct fuse_conn *fc);
+/**
+ * File-system tells the kernel to invalidate cache for the given node id.
+ */
+int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+                             loff_t offset, loff_t len);
+/**
+ * File-system tells the kernel to invalidate parent attributes and
+ * the dentry matching parent/name.
+ */
+int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+                             struct qstr *name);
 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
                 bool isdir);
 ssize_t fuse_direct_io(struct file *file, const char __user *buf,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f0df55a52929..f91ccc4a189d 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -19,7 +19,6 @@
 #include <linux/random.h>
 #include <linux/sched.h>
 #include <linux/exportfs.h>
-#include <linux/smp_lock.h>
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -207,7 +206,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
                BUG();
 }
-static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
+int fuse_inode_eq(struct inode *inode, void *_nodeidp)
 {
        u64 nodeid = *(u64 *) _nodeidp;
        if (get_node_id(inode) == nodeid)
@@ -258,11 +257,34 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
        return inode;
 }
+/*
+ * Invalidate the cached attributes of the inode identified by @nodeid and,
+ * when @offset is not negative, the page cache pages covering
+ * [@offset, @offset + @len).  A @len of zero or less extends the range to
+ * the last possible page index.
+ *
+ * Returns 0, or -ENOENT when no inode with that node id is found.
+ */
+int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+                             loff_t offset, loff_t len)
+{
+        struct inode *target = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid);
+        if (!target)
+                return -ENOENT;
+        fuse_invalidate_attr(target);
+        if (offset >= 0) {
+                pgoff_t first = offset >> PAGE_CACHE_SHIFT;
+                pgoff_t last;
+                if (len > 0)
+                        last = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+                else
+                        last = -1;      /* wraps to the largest pgoff_t */
+                invalidate_inode_pages2_range(target->i_mapping, first, last);
+        }
+        iput(target);
+        return 0;
+}
 static void fuse_umount_begin(struct super_block *sb)
 {
-        lock_kernel();
        fuse_abort_conn(get_fuse_conn_super(sb));
-        unlock_kernel();
 }
 static void fuse_send_destroy(struct fuse_conn *fc)
@@ -483,6 +505,7 @@ void fuse_conn_init(struct fuse_conn *fc)
        memset(fc, 0, sizeof(*fc));
        spin_lock_init(&fc->lock);
        mutex_init(&fc->inst_mutex);
+        init_rwsem(&fc->killsb);
        atomic_set(&fc->count, 1);
        init_waitqueue_head(&fc->waitq);
        init_waitqueue_head(&fc->blocked_waitq);
@@ -728,6 +751,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
                        }
                        if (arg->flags & FUSE_BIG_WRITES)
                                fc->big_writes = 1;
+                        if (arg->flags & FUSE_DONT_MASK)
+                                fc->dont_mask = 1;
                } else {
                        ra_pages = fc->max_read / PAGE_CACHE_SIZE;
                        fc->no_lock = 1;
@@ -751,7 +776,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
        arg->minor = FUSE_KERNEL_MINOR_VERSION;
        arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
        arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
-                FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES;
+                FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK;
        req->in.h.opcode = FUSE_INIT;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(*arg);
@@ -863,10 +888,16 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
        fuse_conn_init(fc);
        fc->dev = sb->s_dev;
+        fc->sb = sb;
        err = fuse_bdi_init(fc, sb);
        if (err)
                goto err_put_conn;
+        /* Handle umasking inside the fuse code */
+        if (sb->s_flags & MS_POSIXACL)
+                fc->dont_mask = 1;
+        sb->s_flags |= MS_POSIXACL;
        fc->release = fuse_free_conn;
        fc->flags = d.flags;
        fc->user_id = d.user_id;
@@ -944,12 +975,25 @@ static int fuse_get_sb(struct file_system_type *fs_type,
        return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
 }
+/*
+ * ->kill_sb for "fuse": clear the connection's super block pointer under
+ * killsb held for writing, then hand the super block to kill_anon_super().
+ */
+static void fuse_kill_sb_anon(struct super_block *sb)
+{
+        struct fuse_conn *conn = get_fuse_conn_super(sb);
+        /* The super block may have no connection attached; skip if so. */
+        if (conn) {
+                down_write(&conn->killsb);
+                conn->sb = NULL;
+                up_write(&conn->killsb);
+        }
+        kill_anon_super(sb);
+}
 static struct file_system_type fuse_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "fuse",
        .fs_flags       = FS_HAS_SUBTYPE,
        .get_sb         = fuse_get_sb,
-        .kill_sb        = kill_anon_super,
+        .kill_sb        = fuse_kill_sb_anon,
 };
 #ifdef CONFIG_BLOCK
@@ -961,11 +1005,24 @@ static int fuse_get_sb_blk(struct file_system_type *fs_type,
                           mnt);
 }
+/*
+ * ->kill_sb for "fuseblk": clear the connection's super block pointer under
+ * killsb held for writing, then hand the super block to kill_block_super().
+ */
+static void fuse_kill_sb_blk(struct super_block *sb)
+{
+        struct fuse_conn *conn = get_fuse_conn_super(sb);
+        /* The super block may have no connection attached; skip if so. */
+        if (conn) {
+                down_write(&conn->killsb);
+                conn->sb = NULL;
+                up_write(&conn->killsb);
+        }
+        kill_block_super(sb);
+}
 static struct file_system_type fuseblk_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "fuseblk",
        .get_sb         = fuse_get_sb_blk,
-        .kill_sb        = kill_block_super,
+        .kill_sb        = fuse_kill_sb_blk,
        .fs_flags       = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
 };
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index cad957cdb1e5..5971359d2090 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
 config GFS2_FS
        tristate "GFS2 file system support"
-        depends on EXPERIMENTAL && (64BIT || LBD)
+        depends on EXPERIMENTAL && (64BIT || LBDAF)
        select DLM if GFS2_FS_LOCKING_DLM
        select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
        select SYSFS if GFS2_FS_LOCKING_DLM
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 03ebb439ace0..7ebae9a4ecc0 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -624,6 +624,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 {
        struct gfs2_inode *ip = GFS2_I(mapping->host);
        struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
+        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
        unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
        int alloc_required;
        int error = 0;
@@ -637,6 +638,14 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
        error = gfs2_glock_nq(&ip->i_gh);
        if (unlikely(error))
                goto out_uninit;
+        if (&ip->i_inode == sdp->sd_rindex) {
+                error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
+                                           GL_NOCACHE, &m_ip->i_gh);
+                if (unlikely(error)) {
+                        gfs2_glock_dq(&ip->i_gh);
+                        goto out_uninit;
+                }
+        }
        error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
        if (error)
@@ -667,6 +676,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
                rblocks += data_blocks ? data_blocks : 1;
        if (ind_blocks || data_blocks)
                rblocks += RES_STATFS + RES_QUOTA;
+        if (&ip->i_inode == sdp->sd_rindex)
+                rblocks += 2 * RES_STATFS;
        error = gfs2_trans_begin(sdp, rblocks,
                                 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -712,6 +723,10 @@ out_alloc_put:
                gfs2_alloc_put(ip);
        }
 out_unlock:
+        if (&ip->i_inode == sdp->sd_rindex) {
+                gfs2_glock_dq(&m_ip->i_gh);
+                gfs2_holder_uninit(&m_ip->i_gh);
+        }
        gfs2_glock_dq(&ip->i_gh);
 out_uninit:
        gfs2_holder_uninit(&ip->i_gh);
@@ -725,14 +740,21 @@ out_uninit:
 static void adjust_fs_space(struct inode *inode)
 {
        struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+        struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
        struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
        struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+        struct buffer_head *m_bh, *l_bh;
        u64 fs_total, new_free;
        /* Total up the file system space, according to the latest rindex. */
        fs_total = gfs2_ri_total(sdp);
+        if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0)
+                return;
        spin_lock(&sdp->sd_statfs_spin);
+        gfs2_statfs_change_in(m_sc, m_bh->b_data +
+                              sizeof(struct gfs2_dinode));
        if (fs_total > (m_sc->sc_total + l_sc->sc_total))
                new_free = fs_total - (m_sc->sc_total + l_sc->sc_total);
        else
@@ -741,6 +763,13 @@ static void adjust_fs_space(struct inode *inode)
        fs_warn(sdp, "File system extended by %llu blocks.\n",
                (unsigned long long)new_free);
        gfs2_statfs_change(sdp, new_free, new_free, 0);
+        if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0)
+                goto out;
+        update_statfs(sdp, m_bh, l_bh);
+        brelse(l_bh);
+out:
+        brelse(m_bh);
 }
 /**
@@ -763,6 +792,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
 {
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
        u64 to = pos + copied;
        void *kaddr;
        unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
@@ -794,6 +824,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
        brelse(dibh);
        gfs2_trans_end(sdp);
+        if (inode == sdp->sd_rindex) {
+                gfs2_glock_dq(&m_ip->i_gh);
+                gfs2_holder_uninit(&m_ip->i_gh);
+        }
        gfs2_glock_dq(&ip->i_gh);
        gfs2_holder_uninit(&ip->i_gh);
        return copied;
@@ -823,6 +857,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
        struct inode *inode = page->mapping->host;
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
        struct buffer_head *dibh;
        struct gfs2_alloc *al = ip->i_alloc;
        unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
@@ -865,6 +900,10 @@ failed:
                gfs2_quota_unlock(ip);
                gfs2_alloc_put(ip);
        }
+        if (inode == sdp->sd_rindex) {
+                gfs2_glock_dq(&m_ip->i_gh);
+                gfs2_holder_uninit(&m_ip->i_gh);
+        }
        gfs2_glock_dq(&ip->i_gh);
        gfs2_holder_uninit(&ip->i_gh);
        return ret;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 297421c0427a..8b674b1f3a55 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -63,6 +63,7 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
 static DECLARE_RWSEM(gfs2_umount_flush_sem);
 static struct dentry *gfs2_root;
 static struct workqueue_struct *glock_workqueue;
+struct workqueue_struct *gfs2_delete_workqueue;
 static LIST_HEAD(lru_list);
 static atomic_t lru_count = ATOMIC_INIT(0);
 static DEFINE_SPINLOCK(lru_lock);
@@ -167,13 +168,33 @@ static void glock_free(struct gfs2_glock *gl)
 *
 */
-static void gfs2_glock_hold(struct gfs2_glock *gl)
+void gfs2_glock_hold(struct gfs2_glock *gl)
 {
        GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0);
        atomic_inc(&gl->gl_ref);
 }
 /**
+ * demote_ok - Check to see if it's ok to unlock a glock
+ * @gl: the glock
+ *
+ * A glock that is already unlocked, or that still has entries on its
+ * gl_holders list, is never eligible.  Otherwise the decision is left to
+ * the glock type's go_demote_ok() hook when one is provided.
+ *
+ * Returns: 1 if it's ok
+ *          0 if it is not (or whatever go_demote_ok() reports)
+ */
+static int demote_ok(const struct gfs2_glock *gl)
+{
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        /* Already unlocked: there is nothing left to demote */
+        if (gl->gl_state == LM_ST_UNLOCKED)
+                return 0;
+        /* Refuse while anything is still queued on the holder list */
+        if (!list_empty(&gl->gl_holders))
+                return 0;
+        /* Optional per-glock-type check has the final say */
+        if (glops->go_demote_ok)
+                return glops->go_demote_ok(gl);
+        return 1;
+}
+/**
 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
 * @gl: the glock
 *
@@ -181,8 +202,13 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
 static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
 {
+        int may_reclaim;
+        may_reclaim = (demote_ok(gl) &&
+                       (atomic_read(&gl->gl_ref) == 1 ||
+                        (gl->gl_name.ln_type == LM_TYPE_INODE &&
+                         atomic_read(&gl->gl_ref) <= 2)));
        spin_lock(&lru_lock);
-        if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) {
+        if (list_empty(&gl->gl_lru) && may_reclaim) {
                list_add_tail(&gl->gl_lru, &lru_list);
                atomic_inc(&lru_count);
        }
@@ -190,6 +216,21 @@ static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
 }
 /**
+ * gfs2_glock_put_nolock() - Decrement reference count on glock
+ * @gl: The glock to put
+ *
+ * This function should only be used if the caller has its own reference
+ * to the glock, in addition to the one it is dropping.
+ *
+ * The glock is never freed here: if the count reaches zero the contract
+ * above was broken and GLOCK_BUG_ON() fires.
+ */
+void gfs2_glock_put_nolock(struct gfs2_glock *gl)
+{
+        /* Reaching zero means the caller did not hold a second reference */
+        if (atomic_dec_and_test(&gl->gl_ref))
+                GLOCK_BUG_ON(gl, 1);
+        /* The helper itself decides whether the glock goes on the LRU list */
+        gfs2_glock_schedule_for_reclaim(gl);
+}
+/**
 * gfs2_glock_put() - Decrement reference count on glock
 * @gl: The glock to put
 *
@@ -214,9 +255,9 @@ int gfs2_glock_put(struct gfs2_glock *gl)
                rv = 1;
                goto out;
        }
-        /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
+        spin_lock(&gl->gl_spin);
-        if (atomic_read(&gl->gl_ref) == 2)
+        gfs2_glock_schedule_for_reclaim(gl);
-                gfs2_glock_schedule_for_reclaim(gl);
+        spin_unlock(&gl->gl_spin);
        write_unlock(gl_lock_addr(gl->gl_hash));
 out:
        return rv;
@@ -398,7 +439,7 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
                if (held2)
                        gfs2_glock_hold(gl);
                else
-                        gfs2_glock_put(gl);
+                        gfs2_glock_put_nolock(gl);
        }
        gl->gl_state = new_state;
@@ -633,12 +674,35 @@ out:
 out_sched:
        gfs2_glock_hold(gl);
        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
-                gfs2_glock_put(gl);
+                gfs2_glock_put_nolock(gl);
 out_unlock:
        clear_bit(GLF_LOCK, &gl->gl_flags);
        goto out;
 }
+/*
+ * delete_work_func - work handler for a glock's gl_delete work item
+ * @work: the work_struct embedded in the glock as gl_delete
+ *
+ * If an inode is attached to the glock (gl_object), look it up again by
+ * block address, prune its dentry aliases and drop the lookup reference.
+ * Always finishes by putting one glock reference; iopen_go_callback()
+ * takes one with gfs2_glock_hold() before queueing this work.
+ */
+static void delete_work_func(struct work_struct *work)
+{
+        struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        struct gfs2_inode *ip = NULL;
+        struct inode *inode;
+        u64 no_addr = 0;
+        /* Sample gl_object and the inode's block address under gl_spin */
+        spin_lock(&gl->gl_spin);
+        ip = (struct gfs2_inode *)gl->gl_object;
+        if (ip)
+                no_addr = ip->i_no_addr;
+        spin_unlock(&gl->gl_spin);
+        /*
+         * From here on ip is only used as a "was attached" flag; the inode
+         * is found again via gfs2_ilookup() rather than through ip.
+         * NOTE(review): presumably because ip may go away once gl_spin is
+         * dropped - confirm.
+         */
+        if (ip) {
+                inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
+                if (inode) {
+                        d_prune_aliases(inode);
+                        iput(inode);
+                }
+        }
+        gfs2_glock_put(gl);
+}
 static void glock_work_func(struct work_struct *work)
 {
        unsigned long delay = 0;
@@ -717,6 +781,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
        gl->gl_sbd = sdp;
        gl->gl_aspace = NULL;
        INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
+        INIT_WORK(&gl->gl_delete, delete_work_func);
        /* If this glock protects actual on-disk data or metadata blocks,
           create a VFS inode to manage the pages/buffers holding them. */
@@ -858,6 +923,8 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
                        gl->gl_demote_state != state) {
                gl->gl_demote_state = LM_ST_UNLOCKED;
        }
+        if (gl->gl_ops->go_callback)
+                gl->gl_ops->go_callback(gl);
        trace_gfs2_demote_rq(gl);
 }
@@ -1274,33 +1341,12 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
                gfs2_glock_put(gl);
 }
-/**
- * demote_ok - Check to see if it's ok to unlock a glock
- * @gl: the glock
- *
- * Returns: 1 if it's ok
- */
-static int demote_ok(const struct gfs2_glock *gl)
-{
-        const struct gfs2_glock_operations *glops = gl->gl_ops;
-        if (gl->gl_state == LM_ST_UNLOCKED)
-                return 0;
-        if (!list_empty(&gl->gl_holders))
-                return 0;
-        if (glops->go_demote_ok)
-                return glops->go_demote_ok(gl);
-        return 1;
-}
 static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
 {
        struct gfs2_glock *gl;
        int may_demote;
        int nr_skipped = 0;
-        int got_ref = 0;
        LIST_HEAD(skipped);
        if (nr == 0)
@@ -1315,37 +1361,29 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
                list_del_init(&gl->gl_lru);
                atomic_dec(&lru_count);
+                /* Check if glock is about to be freed */
+                if (atomic_read(&gl->gl_ref) == 0)
+                        continue;
                /* Test for being demotable */
                if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
                        gfs2_glock_hold(gl);
-                        got_ref = 1;
                        spin_unlock(&lru_lock);
                        spin_lock(&gl->gl_spin);
                        may_demote = demote_ok(gl);
-                        spin_unlock(&gl->gl_spin);
-                        clear_bit(GLF_LOCK, &gl->gl_flags);
                        if (may_demote) {
                                handle_callback(gl, LM_ST_UNLOCKED, 0);
                                nr--;
-                                if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
-                                        gfs2_glock_put(gl);
-                                got_ref = 0;
                        }
+                        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+                                gfs2_glock_put_nolock(gl);
+                        spin_unlock(&gl->gl_spin);
+                        clear_bit(GLF_LOCK, &gl->gl_flags);
                        spin_lock(&lru_lock);
-                        if (may_demote)
+                        continue;
-                                continue;
-                }
-                if (list_empty(&gl->gl_lru) &&
-                    (atomic_read(&gl->gl_ref) <= (2 + got_ref))) {
-                        nr_skipped++;
-                        list_add(&gl->gl_lru, &skipped);
-                }
-                if (got_ref) {
-                        spin_unlock(&lru_lock);
-                        gfs2_glock_put(gl);
-                        spin_lock(&lru_lock);
-                        got_ref = 0;
                }
+                nr_skipped++;
+                list_add(&gl->gl_lru, &skipped);
        }
        list_splice(&skipped, &lru_list);
        atomic_add(nr_skipped, &lru_count);
@@ -1727,6 +1765,11 @@ int __init gfs2_glock_init(void)
        glock_workqueue = create_workqueue("glock_workqueue");
        if (IS_ERR(glock_workqueue))
                return PTR_ERR(glock_workqueue);
+        gfs2_delete_workqueue = create_workqueue("delete_workqueue");
+        if (IS_ERR(gfs2_delete_workqueue)) {
+                destroy_workqueue(glock_workqueue);
+                return PTR_ERR(gfs2_delete_workqueue);
+        }
        register_shrinker(&glock_shrinker);
@@ -1737,6 +1780,7 @@ void gfs2_glock_exit(void)
 {
        unregister_shrinker(&glock_shrinker);
        destroy_workqueue(glock_workqueue);
+        destroy_workqueue(gfs2_delete_workqueue);
 }
 static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index a602a28f6f08..c609894ec0d0 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -143,6 +143,7 @@ struct lm_lockops {
 #define GLR_TRYFAILED           13
+extern struct workqueue_struct *gfs2_delete_workqueue;
 static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
 {
        struct gfs2_holder *gh;
@@ -191,6 +192,8 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
 int gfs2_glock_get(struct gfs2_sbd *sdp,
                   u64 number, const struct gfs2_glock_operations *glops,
                   int create, struct gfs2_glock **glp);
+void gfs2_glock_hold(struct gfs2_glock *gl);
+void gfs2_glock_put_nolock(struct gfs2_glock *gl);
 int gfs2_glock_put(struct gfs2_glock *gl);
 void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
                      struct gfs2_holder *gh);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index d5e4ab155ca0..6985eef06c39 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -323,6 +323,7 @@ static void trans_go_sync(struct gfs2_glock *gl)
        if (gl->gl_state != LM_ST_UNLOCKED &&
            test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+                flush_workqueue(gfs2_delete_workqueue);
                gfs2_meta_syncfs(sdp);
                gfs2_log_shutdown(sdp);
        }
@@ -372,6 +373,25 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl)
        return 0;
 }
+/**
+ * iopen_go_callback - schedule the dcache entry for the inode to be deleted
+ * @gl: the glock
+ *
+ * gl_spin lock is held while calling this
+ *
+ * Queues the glock's gl_delete work on gfs2_delete_workqueue, but only
+ * when the pending demote target is LM_ST_UNLOCKED, the glock is currently
+ * LM_ST_SHARED, and the attached inode has GIF_USER set.
+ */
+static void iopen_go_callback(struct gfs2_glock *gl)
+{
+        struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
+        if (gl->gl_demote_state == LM_ST_UNLOCKED &&
+            gl->gl_state == LM_ST_SHARED &&
+            ip && test_bit(GIF_USER, &ip->i_flags)) {
+                /* Take a reference on behalf of the work item before queueing */
+                gfs2_glock_hold(gl);
+                /*
+                 * queue_work() returned 0: give the reference just taken
+                 * straight back (presumably the work was already pending -
+                 * confirm against the workqueue API).
+                 */
+                if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
+                        gfs2_glock_put_nolock(gl);
+        }
+}
 const struct gfs2_glock_operations gfs2_meta_glops = {
        .go_type = LM_TYPE_META,
 };
@@ -406,6 +426,7 @@ const struct gfs2_glock_operations gfs2_trans_glops = {
 const struct gfs2_glock_operations gfs2_iopen_glops = {
        .go_type = LM_TYPE_IOPEN,
+        .go_callback = iopen_go_callback,
 };
 const struct gfs2_glock_operations gfs2_flock_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 225347fbff3c..61801ada36f0 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -159,6 +159,7 @@ struct gfs2_glock_operations {
        int (*go_lock) (struct gfs2_holder *gh);
        void (*go_unlock) (struct gfs2_holder *gh);
        int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
+        void (*go_callback) (struct gfs2_glock *gl);
        const int go_type;
        const unsigned long go_min_hold_time;
 };
@@ -228,6 +229,7 @@ struct gfs2_glock {
        struct list_head gl_ail_list;
        atomic_t gl_ail_count;
        struct delayed_work gl_work;
+        struct work_struct gl_delete;
 };
 #define GFS2_MIN_LVB_SIZE 32    /* Min size of LVB that gfs2 supports */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index daa4ae341a29..fba795798d3a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -285,27 +285,19 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
        }
        tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
-        if (count[1] + count[2] != tmp) {
+        if (count[1] != tmp) {
                if (gfs2_consist_rgrpd(rgd))
                        fs_err(sdp, "used data mismatch:  %u != %u\n",
                               count[1], tmp);
                return;
        }
-        if (count[3] != rgd->rd_dinodes) {
+        if (count[2] + count[3] != rgd->rd_dinodes) {
                if (gfs2_consist_rgrpd(rgd))
                        fs_err(sdp, "used metadata mismatch:  %u != %u\n",
-                               count[3], rgd->rd_dinodes);
+                               count[2] + count[3], rgd->rd_dinodes);
                return;
        }
-        if (count[2] > count[3]) {
-                if (gfs2_consist_rgrpd(rgd))
-                        fs_err(sdp, "unlinked inodes > inodes:  %u\n",
-                               count[2]);
-                return;
-        }
 }
 static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
@@ -961,7 +953,8 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
 * Returns: The inode, if one has been found
 */
-static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
+static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
+                                     u64 skip)
 {
        struct inode *inode;
        u32 goal = 0, block;
@@ -985,6 +978,8 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
                goal++;
                if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked)
                        continue;
+                if (no_addr == skip)
+                        continue;
                *last_unlinked = no_addr;
                inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
                                          no_addr, -1, 1);
@@ -1104,7 +1099,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
                        if (try_rgrp_fit(rgd, al))
                                goto out;
                        if (rgd->rd_flags & GFS2_RDF_CHECK)
-                                inode = try_rgrp_unlink(rgd, last_unlinked);
+                                inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
                        if (!rg_locked)
                                gfs2_glock_dq_uninit(&al->al_rgd_gh);
                        if (inode)
@@ -1138,7 +1133,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
                        if (try_rgrp_fit(rgd, al))
                                goto out;
                        if (rgd->rd_flags & GFS2_RDF_CHECK)
-                                inode = try_rgrp_unlink(rgd, last_unlinked);
+                                inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
                        if (!rg_locked)
                                gfs2_glock_dq_uninit(&al->al_rgd_gh);
                        if (inode)
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 0a6801336470..f522bb017973 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -353,7 +353,7 @@ fail:
        return error;
 }
-static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
+void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
 {
        const struct gfs2_statfs_change *str = buf;
@@ -441,6 +441,29 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
        brelse(l_bh);
 }
+/*
+ * update_statfs - fold the local statfs change into the master statfs
+ * @sdp: the filesystem
+ * @m_bh: buffer for the master statfs inode (sd_statfs_inode)
+ * @l_bh: buffer for the local statfs-change inode (sd_sc_inode)
+ *
+ * Adds the in-core local totals (sd_statfs_local) to the in-core master
+ * totals (sd_statfs_master), zeroes the local copy both in core and in
+ * @l_bh, and writes the new master totals into @m_bh.  Both buffers are
+ * handed to gfs2_trans_add_bh() first.
+ * NOTE(review): callers presumably need an open transaction - confirm.
+ */
+void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
+                   struct buffer_head *l_bh)
+{
+        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+        struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+        struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+        struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+        gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+        /* sd_statfs_spin covers the in-core totals and the l_bh wipe below */
+        spin_lock(&sdp->sd_statfs_spin);
+        m_sc->sc_total += l_sc->sc_total;
+        m_sc->sc_free += l_sc->sc_free;
+        m_sc->sc_dinodes += l_sc->sc_dinodes;
+        /*
+         * NOTE(review): l_sc is a struct gfs2_statfs_change_host, but the
+         * size used is that of struct gfs2_statfs_change - verify the two
+         * are the same size, or switch to sizeof(*l_sc).
+         */
+        memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
+        /* The change record in the buffer follows the dinode header */
+        memset(l_bh->b_data + sizeof(struct gfs2_dinode),
+               0, sizeof(struct gfs2_statfs_change));
+        spin_unlock(&sdp->sd_statfs_spin);
+        gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
+        gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
+}
 int gfs2_statfs_sync(struct gfs2_sbd *sdp)
 {
        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
@@ -477,19 +500,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
        if (error)
                goto out_bh2;
-        gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+        update_statfs(sdp, m_bh, l_bh);
-        spin_lock(&sdp->sd_statfs_spin);
-        m_sc->sc_total += l_sc->sc_total;
-        m_sc->sc_free += l_sc->sc_free;
-        m_sc->sc_dinodes += l_sc->sc_dinodes;
-        memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
-        memset(l_bh->b_data + sizeof(struct gfs2_dinode),
-               0, sizeof(struct gfs2_statfs_change));
-        spin_unlock(&sdp->sd_statfs_spin);
-        gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
-        gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
        gfs2_trans_end(sdp);
@@ -680,6 +691,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
        struct gfs2_holder t_gh;
        int error;
+        flush_workqueue(gfs2_delete_workqueue);
        gfs2_quota_sync(sdp);
        gfs2_statfs_sync(sdp);
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index b56413e3e40d..22e0417ed996 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -40,6 +40,10 @@ extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
 extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
 extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
                               s64 dinodes);
+extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
+                                  const void *buf);
+extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
+                          struct buffer_head *l_bh);
 extern int gfs2_statfs_sync(struct gfs2_sbd *sdp);
 extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 98d6ef1c1dc0..148d55c14171 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -1,12 +1,11 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM gfs2
 #if !defined(_TRACE_GFS2_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_GFS2_H
 #include <linux/tracepoint.h>
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM gfs2
-#define TRACE_INCLUDE_FILE trace_gfs2
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
 #include <linux/dlmconstants.h>
@@ -403,5 +402,6 @@ TRACE_EVENT(gfs2_block_alloc,
 /* This part must be outside protection */
 #undef TRACE_INCLUDE_PATH
 #define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_gfs2
 #include <trace/define_trace.h>
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 6f833dc8e910..f7fcbe49da72 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -19,6 +19,7 @@
 #include <linux/nls.h>
 #include <linux/parser.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include "hfs_fs.h"
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 9fc3af0c0dab..c0759fe0855b 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -12,6 +12,7 @@
 #include <linux/pagemap.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/nls.h>
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index fe02ad4740e7..032604e5ef2c 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -972,6 +972,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
        sb->s_blocksize_bits = 10;
        sb->s_magic = HOSTFS_SUPER_MAGIC;
        sb->s_op = &hostfs_sbops;
+        sb->s_maxbytes = MAX_LFS_FILESIZE;
        /* NULL is printed as <NULL> by sprintf: avoid that. */
        if (req_root == NULL)
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 6916c41d7017..8865c94f55f6 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -6,6 +6,7 @@
 *  directory VFS functions
 */
+#include <linux/smp_lock.h>
 #include "hpfs_fn.h"
 static int hpfs_dir_release(struct inode *inode, struct file *filp)
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 64ab52259204..3efabff00367 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -6,6 +6,7 @@
 *  file VFS functions
 */
+#include <linux/smp_lock.h>
 #include "hpfs_fn.h"
 #define BLOCKS(size) (((size) + 511) >> 9)
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index c2ea31bae313..701ca54c0867 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -13,7 +13,6 @@
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include "hpfs.h"
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 39a1bfbea312..fe703ae46bc7 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -6,6 +6,7 @@
 *  inode VFS functions
 */
+#include <linux/smp_lock.h>
 #include "hpfs_fn.h"
 void hpfs_init_inode(struct inode *i)
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index b649232dde97..82b9c4ba9ed0 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -6,6 +6,7 @@
 *  adding & removing files & directories
 */
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include "hpfs_fn.h"
 static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
diff --git a/fs/inode.c b/fs/inode.c
index a88baebf77cf..ae7b67e48661 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -25,6 +25,7 @@
 #include <linux/fsnotify.h>
 #include <linux/mount.h>
 #include <linux/async.h>
+#include <linux/posix_acl.h>
 /*
 * This is needed for the following functions:
@@ -119,12 +120,11 @@ static void wake_up_inode(struct inode *inode)
 * These are initializations that need to be done on every inode
 * allocation as the fields are not initialised by slab allocation.
 */
-struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
+int inode_init_always(struct super_block *sb, struct inode *inode)
 {
        static const struct address_space_operations empty_aops;
        static struct inode_operations empty_iops;
        static const struct file_operations empty_fops;
        struct address_space *const mapping = &inode->i_data;
        inode->i_sb = sb;
@@ -151,7 +151,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
        inode->dirtied_when = 0;
        if (security_inode_alloc(inode))
-                goto out_free_inode;
+                goto out;
        /* allocate and initialize an i_integrity */
        if (ima_inode_alloc(inode))
@@ -189,21 +189,20 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
        }
        inode->i_private = NULL;
        inode->i_mapping = mapping;
+#ifdef CONFIG_FS_POSIX_ACL
+        inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
+#endif
 #ifdef CONFIG_FSNOTIFY
        inode->i_fsnotify_mask = 0;
 #endif
-        return inode;
+        return 0;
 out_free_security:
        security_inode_free(inode);
-out_free_inode:
+out:
-        if (inode->i_sb->s_op->destroy_inode)
+        return -ENOMEM;
-                inode->i_sb->s_op->destroy_inode(inode);
-        else
-                kmem_cache_free(inode_cachep, (inode));
-        return NULL;
 }
 EXPORT_SYMBOL(inode_init_always);
@@ -216,24 +215,43 @@ static struct inode *alloc_inode(struct super_block *sb)
        else
                inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
-        if (inode)
+        if (!inode)
-                return inode_init_always(sb, inode);
+                return NULL;
-        return NULL;
+        if (unlikely(inode_init_always(sb, inode))) {
+                if (inode->i_sb->s_op->destroy_inode)
+                        inode->i_sb->s_op->destroy_inode(inode);
+                else
+                        kmem_cache_free(inode_cachep, inode);
+                return NULL;
+        }
+        return inode;
 }
-void destroy_inode(struct inode *inode)
+/*
+ * __destroy_inode - tear down the state attached to an inode
+ *
+ * Calls the IMA, security and fsnotify teardown hooks and, with
+ * CONFIG_FS_POSIX_ACL, releases i_acl / i_default_acl.  The inode memory
+ * itself is not freed here; destroy_inode() below does that.
+ */
+void __destroy_inode(struct inode *inode)
 {
        BUG_ON(inode_has_buffers(inode));
        ima_inode_free(inode);
        security_inode_free(inode);
        fsnotify_inode_delete(inode);
+#ifdef CONFIG_FS_POSIX_ACL
+        /* Skip both NULL and the ACL_NOT_CACHED marker value */
+        if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED)
+                posix_acl_release(inode->i_acl);
+        if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
+                posix_acl_release(inode->i_default_acl);
+#endif
+}
+EXPORT_SYMBOL(__destroy_inode);
+/*
+ * destroy_inode - tear down and free an inode
+ *
+ * Runs __destroy_inode(), then frees the inode through the superblock's
+ * ->destroy_inode method when one is set, or back to inode_cachep
+ * otherwise.
+ */
+void destroy_inode(struct inode *inode)
+{
+        __destroy_inode(inode);
        if (inode->i_sb->s_op->destroy_inode)
                inode->i_sb->s_op->destroy_inode(inode);
        else
                kmem_cache_free(inode_cachep, (inode));
 }
-EXPORT_SYMBOL(destroy_inode);
 /*
 * These are initializations that only need to be done
@@ -665,12 +683,17 @@ void unlock_new_inode(struct inode *inode)
        if (inode->i_mode & S_IFDIR) {
                struct file_system_type *type = inode->i_sb->s_type;
-                /*
+                /* Set new key only if filesystem hasn't already changed it */
-                 * ensure nobody is actually holding i_mutex
+                if (!lockdep_match_class(&inode->i_mutex,
-                 */
+                    &type->i_mutex_key)) {
-                mutex_destroy(&inode->i_mutex);
+                        /*
-                mutex_init(&inode->i_mutex);
+                         * ensure nobody is actually holding i_mutex
-                lockdep_set_class(&inode->i_mutex, &type->i_mutex_dir_key);
+                         */
+                        mutex_destroy(&inode->i_mutex);
+                        mutex_init(&inode->i_mutex);
+                        lockdep_set_class(&inode->i_mutex,
+                                          &type->i_mutex_dir_key);
+                }
        }
 #endif
        /*
@@ -1408,7 +1431,7 @@ EXPORT_SYMBOL(touch_atime);
 *      for writeback.  Note that this function is meant exclusively for
 *      usage in the file write path of filesystems, and filesystems may
 *      choose to explicitly ignore update via this function with the
- *      S_NOCTIME inode flag, e.g. for network filesystem where these
+ *      S_NOCMTIME inode flag, e.g. for network filesystem where these
 *      timestamps are handled by the server.
 */
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 286f38dfc6c0..5612880fcbe7 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -15,6 +15,7 @@
 #include <linux/uaccess.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
+#include <linux/falloc.h>
 #include <asm/ioctls.h>
@@ -70,9 +71,7 @@ static int ioctl_fibmap(struct file *filp, int __user *p)
        res = get_user(block, p);
        if (res)
                return res;
-        lock_kernel();
        res = mapping->a_ops->bmap(mapping, block);
-        unlock_kernel();
        return put_user(res, p);
 }
@@ -405,6 +404,37 @@ EXPORT_SYMBOL(generic_block_fiemap);
 #endif  /*  CONFIG_BLOCK  */
+/*
+ * This provides compatibility with legacy XFS pre-allocation ioctls
+ * which predate the fallocate syscall.
+ *
+ * Only the l_start, l_len and l_whence fields of the 'struct space_resv'
+ * are used here; the rest are ignored.
+ *
+ * Returns -EFAULT if the argument cannot be copied from user space,
+ * -EINVAL for an unrecognised l_whence, and otherwise the result of
+ * do_fallocate() called with FALLOC_FL_KEEP_SIZE.
+ */
+int ioctl_preallocate(struct file *filp, void __user *argp)
+{
+        struct inode *inode = filp->f_path.dentry->d_inode;
+        struct space_resv sr;
+        if (copy_from_user(&sr, argp, sizeof(sr)))
+                return -EFAULT;
+        /* Turn l_start into an offset from the start of the file */
+        switch (sr.l_whence) {
+        case SEEK_SET:
+                break;
+        case SEEK_CUR:
+                sr.l_start += filp->f_pos;
+                break;
+        case SEEK_END:
+                sr.l_start += i_size_read(inode);
+                break;
+        default:
+                return -EINVAL;
+        }
+        return do_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len);
+}
 static int file_ioctl(struct file *filp, unsigned int cmd,
                unsigned long arg)
 {
@@ -416,6 +446,9 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
                return ioctl_fibmap(filp, p);
        case FIONREAD:
                return put_user(i_size_read(inode) - filp->f_pos, p);
+        case FS_IOC_RESVSP:
+        case FS_IOC_RESVSP64:
+                return ioctl_preallocate(filp, p);
        }
        return vfs_ioctl(filp, cmd, arg);
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 2f0dc5a14633..8ba5441063be 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -195,9 +195,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
                 * Do not report hidden files if so instructed, or associated
                 * files unless instructed to do so
                 */
-                if ((sbi->s_hide == 'y' &&
+                if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) ||
-                                (de->flags[-sbi->s_high_sierra] & 1)) ||
+                    (!sbi->s_showassoc &&
-                                (sbi->s_showassoc =='n' &&
                                (de->flags[-sbi->s_high_sierra] & 4))) {
                        filp->f_pos += de_len;
                        continue;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 068b34b5a107..85f96bc651c7 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -141,13 +141,17 @@ static const struct dentry_operations isofs_dentry_ops[] = {
 };
 struct iso9660_options{
-        char map;
+        unsigned int rock:1;
-        char rock;
+        unsigned int joliet:1;
-        char joliet;
+        unsigned int cruft:1;
-        char cruft;
+        unsigned int hide:1;
-        char hide;
+        unsigned int showassoc:1;
-        char showassoc;
+        unsigned int nocompress:1;
-        char nocompress;
+        unsigned int overriderockperm:1;
+        unsigned int uid_set:1;
+        unsigned int gid_set:1;
+        unsigned int utf8:1;
+        unsigned char map;
        unsigned char check;
        unsigned int blocksize;
        mode_t fmode;
@@ -155,7 +159,6 @@ struct iso9660_options{
        gid_t gid;
        uid_t uid;
        char *iocharset;
-        unsigned char utf8;
        /* LVE */
        s32 session;
        s32 sbsector;
@@ -312,7 +315,7 @@ enum {
        Opt_block, Opt_check_r, Opt_check_s, Opt_cruft, Opt_gid, Opt_ignore,
        Opt_iocharset, Opt_map_a, Opt_map_n, Opt_map_o, Opt_mode, Opt_nojoliet,
        Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err,
-        Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode,
+        Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, Opt_overriderockperm,
 };
 static const match_table_t tokens = {
@@ -340,6 +343,7 @@ static const match_table_t tokens = {
        {Opt_gid, "gid=%u"},
        {Opt_mode, "mode=%u"},
        {Opt_dmode, "dmode=%u"},
+        {Opt_overriderockperm, "overriderockperm"},
        {Opt_block, "block=%u"},
        {Opt_ignore, "conv=binary"},
        {Opt_ignore, "conv=b"},
@@ -359,24 +363,22 @@ static int parse_options(char *options, struct iso9660_options *popt)
        int option;
        popt->map = 'n';
-        popt->rock = 'y';
+        popt->rock = 1;
-        popt->joliet = 'y';
+        popt->joliet = 1;
-        popt->cruft = 'n';
+        popt->cruft = 0;
-        popt->hide = 'n';
+        popt->hide = 0;
-        popt->showassoc = 'n';
+        popt->showassoc = 0;
        popt->check = 'u';              /* unset */
        popt->nocompress = 0;
        popt->blocksize = 1024;
-        popt->fmode = popt->dmode = S_IRUGO | S_IXUGO; /*
+        popt->fmode = popt->dmode = ISOFS_INVALID_MODE;
-                                         * r-x for all.  The disc could
+        popt->uid_set = 0;
-                                         * be shared with DOS machines so
+        popt->gid_set = 0;
-                                         * virtually anything could be
-                                         * a valid executable.
-                                         */
        popt->gid = 0;
        popt->uid = 0;
        popt->iocharset = NULL;
        popt->utf8 = 0;
+        popt->overriderockperm = 0;
        popt->session=-1;
        popt->sbsector=-1;
        if (!options)
@@ -393,20 +395,20 @@ static int parse_options(char *options, struct iso9660_options *popt)
                token = match_token(p, tokens, args);
                switch (token) {
                case Opt_norock:
-                        popt->rock = 'n';
+                        popt->rock = 0;
                        break;
                case Opt_nojoliet:
-                        popt->joliet = 'n';
+                        popt->joliet = 0;
                        break;
                case Opt_hide:
-                        popt->hide = 'y';
+                        popt->hide = 1;
                        break;
                case Opt_unhide:
                case Opt_showassoc:
-                        popt->showassoc = 'y';
+                        popt->showassoc = 1;
                        break;
                case Opt_cruft:
-                        popt->cruft = 'y';
+                        popt->cruft = 1;
                        break;
                case Opt_utf8:
                        popt->utf8 = 1;
@@ -450,11 +452,13 @@ static int parse_options(char *options, struct iso9660_options *popt)
                        if (match_int(&args[0], &option))
                                return 0;
                        popt->uid = option;
+                        popt->uid_set = 1;
                        break;
                case Opt_gid:
                        if (match_int(&args[0], &option))
                                return 0;
                        popt->gid = option;
+                        popt->gid_set = 1;
                        break;
                case Opt_mode:
                        if (match_int(&args[0], &option))
@@ -466,6 +470,9 @@ static int parse_options(char *options, struct iso9660_options *popt)
                                return 0;
                        popt->dmode = option;
                        break;
+                case Opt_overriderockperm:
+                        popt->overriderockperm = 1;
+                        break;
                case Opt_block:
                        if (match_int(&args[0], &option))
                                return 0;
@@ -625,7 +632,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
                        else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) {
                                sec = (struct iso_supplementary_descriptor *)vdp;
                                if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) {
-                                        if (opt.joliet == 'y') {
+                                        if (opt.joliet) {
                                                if (sec->escape[2] == 0x40)
                                                        joliet_level = 1;
                                                else if (sec->escape[2] == 0x43)
@@ -650,7 +657,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
                                        goto out_freebh;
                                sbi->s_high_sierra = 1;
-                                opt.rock = 'n';
+                                opt.rock = 0;
                                h_pri = (struct hs_primary_descriptor *)vdp;
                                goto root_found;
                        }
@@ -673,7 +680,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
 root_found:
-        if (joliet_level && (pri == NULL || opt.rock == 'n')) {
+        if (joliet_level && (pri == NULL || !opt.rock)) {
                /* This is the case of Joliet with the norock mount flag.
                 * A disc with both Joliet and Rock Ridge is handled later
                 */
@@ -802,22 +809,31 @@ root_found:
        s->s_op = &isofs_sops;
        s->s_export_op = &isofs_export_ops;
        sbi->s_mapping = opt.map;
-        sbi->s_rock = (opt.rock == 'y' ? 2 : 0);
+        sbi->s_rock = (opt.rock ? 2 : 0);
        sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/
        sbi->s_cruft = opt.cruft;
        sbi->s_hide = opt.hide;
        sbi->s_showassoc = opt.showassoc;
        sbi->s_uid = opt.uid;
        sbi->s_gid = opt.gid;
+        sbi->s_uid_set = opt.uid_set;
+        sbi->s_gid_set = opt.gid_set;
        sbi->s_utf8 = opt.utf8;
        sbi->s_nocompress = opt.nocompress;
+        sbi->s_overriderockperm = opt.overriderockperm;
        /*
         * It would be incredibly stupid to allow people to mark every file
         * on the disk as suid, so we merely allow them to set the default
         * permissions.
         */
-        sbi->s_fmode = opt.fmode & 0777;
+        if (opt.fmode != ISOFS_INVALID_MODE)
-        sbi->s_dmode = opt.dmode & 0777;
+                sbi->s_fmode = opt.fmode & 0777;
+        else
+                sbi->s_fmode = ISOFS_INVALID_MODE;
+        if (opt.dmode != ISOFS_INVALID_MODE)
+                sbi->s_dmode = opt.dmode & 0777;
+        else
+                sbi->s_dmode = ISOFS_INVALID_MODE;
        /*
         * Read the root inode, which _may_ result in changing
@@ -1095,18 +1111,6 @@ static const struct address_space_operations isofs_aops = {
        .bmap = _isofs_bmap
 };
-static inline void test_and_set_uid(uid_t *p, uid_t value)
-{
-        if (value)
-                *p = value;
-}
-static inline void test_and_set_gid(gid_t *p, gid_t value)
-{
-        if (value)
-                *p = value;
-}
 static int isofs_read_level3_size(struct inode *inode)
 {
        unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
@@ -1261,7 +1265,10 @@ static int isofs_read_inode(struct inode *inode)
        ei->i_file_format = isofs_file_normal;
        if (de->flags[-high_sierra] & 2) {
-                inode->i_mode = sbi->s_dmode | S_IFDIR;
+                if (sbi->s_dmode != ISOFS_INVALID_MODE)
+                        inode->i_mode = S_IFDIR | sbi->s_dmode;
+                else
+                        inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
                inode->i_nlink = 1;     /*
                                         * Set to 1.  We know there are 2, but
                                         * the find utility tries to optimize
@@ -1270,8 +1277,16 @@ static int isofs_read_inode(struct inode *inode)
                                         * do it the hard way.
                                         */
        } else {
-                /* Everybody gets to read the file. */
+                if (sbi->s_fmode != ISOFS_INVALID_MODE) {
-                inode->i_mode = sbi->s_fmode | S_IFREG;
+                        inode->i_mode = S_IFREG | sbi->s_fmode;
+                } else {
+                        /*
+                         * Set default permissions: r-x for all.  The disc
+                         * could be shared with DOS machines so virtually
+                         * anything could be a valid executable.
+                         */
+                        inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO;
+                }
                inode->i_nlink = 1;
        }
        inode->i_uid = sbi->s_uid;
@@ -1300,7 +1315,7 @@ static int isofs_read_inode(struct inode *inode)
         * this CDROM was mounted with the cruft option.
         */
-        if (sbi->s_cruft == 'y')
+        if (sbi->s_cruft)
                inode->i_size &= 0x00ffffff;
        if (de->interleave[0]) {
@@ -1346,9 +1361,18 @@ static int isofs_read_inode(struct inode *inode)
        if (!high_sierra) {
                parse_rock_ridge_inode(de, inode);
                /* if we want uid/gid set, override the rock ridge setting */
-                test_and_set_uid(&inode->i_uid, sbi->s_uid);
+                if (sbi->s_uid_set)
-                test_and_set_gid(&inode->i_gid, sbi->s_gid);
+                        inode->i_uid = sbi->s_uid;
+                if (sbi->s_gid_set)
+                        inode->i_gid = sbi->s_gid;
        }
+        /* Now set final access rights if overriding rock ridge setting */
+        if (S_ISDIR(inode->i_mode) && sbi->s_overriderockperm &&
+            sbi->s_dmode != ISOFS_INVALID_MODE)
+                inode->i_mode = S_IFDIR | sbi->s_dmode;
+        if (S_ISREG(inode->i_mode) && sbi->s_overriderockperm &&
+            sbi->s_fmode != ISOFS_INVALID_MODE)
+                inode->i_mode = S_IFREG | sbi->s_fmode;
        /* Install the inode operations vector */
        if (S_ISREG(inode->i_mode)) {
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index ccbf72faf27a..7d33de84f52a 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -35,21 +35,20 @@ struct isofs_sb_info {
        unsigned long s_log_zone_size;
        unsigned long s_max_size;
        
-        unsigned char s_high_sierra; /* A simple flag */
-        unsigned char s_mapping;
        int           s_rock_offset; /* offset of SUSP fields within SU area */
-        unsigned char s_rock;
        unsigned char s_joliet_level;
-        unsigned char s_utf8;
+        unsigned char s_mapping;
-        unsigned char s_cruft; /* Broken disks with high
+        unsigned int  s_high_sierra:1;
-                                  byte of length containing
+        unsigned int  s_rock:2;
-                                  junk */
+        unsigned int  s_utf8:1;
-        unsigned char s_unhide;
+        unsigned int  s_cruft:1; /* Broken disks with high byte of length
-        unsigned char s_nosuid;
+                                  * containing junk */
-        unsigned char s_nodev;
+        unsigned int  s_nocompress:1;
-        unsigned char s_nocompress;
+        unsigned int  s_hide:1;
-        unsigned char s_hide;
+        unsigned int  s_showassoc:1;
-        unsigned char s_showassoc;
+        unsigned int  s_overriderockperm:1;
+        unsigned int  s_uid_set:1;
+        unsigned int  s_gid_set:1;
        mode_t s_fmode;
        mode_t s_dmode;
@@ -58,6 +57,8 @@ struct isofs_sb_info {
        struct nls_table *s_nls_iocharset; /* Native language support table */
 };
+#define ISOFS_INVALID_MODE ((mode_t) -1)
 static inline struct isofs_sb_info *ISOFS_SB(struct super_block *sb)
 {
        return sb->s_fs_info;
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index 92c14b850e9c..a048de81c093 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -37,37 +37,6 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls)
        return (op - ascii);
 }
-/* Convert big endian wide character string to utf8 */
-static int
-wcsntombs_be(__u8 *s, const __u8 *pwcs, int inlen, int maxlen)
-{
-        const __u8 *ip;
-        __u8 *op;
-        int size;
-        __u16 c;
-        op = s;
-        ip = pwcs;
-        while ((*ip || ip[1]) && (maxlen > 0) && (inlen > 0)) {
-                c = (*ip << 8) | ip[1];
-                if (c > 0x7f) {
-                        size = utf8_wctomb(op, c, maxlen);
-                        if (size == -1) {
-                                /* Ignore character and move on */
-                                maxlen--;
-                        } else {
-                                op += size;
-                                maxlen -= size;
-                        }
-                } else {
-                        *op++ = (__u8) c;
-                }
-                ip += 2;
-                inlen--;
-        }
-        return (op - s);
-}
 int
 get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode)
 {
@@ -79,8 +48,9 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st
        nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset;
        if (utf8) {
-                len = wcsntombs_be(outname, de->name,
+                len = utf16s_to_utf8s((const wchar_t *) de->name,
-                                de->name_len[0] >> 1, PAGE_SIZE);
+                                de->name_len[0] >> 1, UTF16_BIG_ENDIAN,
+                                outname, PAGE_SIZE);
        } else {
                len = uni16_to_x8(outname, (__be16 *) de->name,
                                de->name_len[0] >> 1, nls);
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 8299889a835e..eaa831311c9c 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -142,9 +142,9 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
                 */
                match = 0;
                if (dlen > 0 &&
-                        (sbi->s_hide =='n' ||
+                        (!sbi->s_hide ||
                                (!(de->flags[-sbi->s_high_sierra] & 1))) &&
-                        (sbi->s_showassoc =='y' ||
+                        (sbi->s_showassoc ||
                                (!(de->flags[-sbi->s_high_sierra] & 4)))) {
                        match = (isofs_cmp(dentry, dpnt, dlen) == 0);
                }
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 737f7246a4b5..f96f85092d1c 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -287,6 +287,7 @@ int journal_write_metadata_buffer(transaction_t *transaction,
        struct page *new_page;
        unsigned int new_offset;
        struct buffer_head *bh_in = jh2bh(jh_in);
+        journal_t *journal = transaction->t_journal;
        /*
         * The buffer really shouldn't be locked: only the current committing
@@ -300,6 +301,11 @@ int journal_write_metadata_buffer(transaction_t *transaction,
        J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
        new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+        /* keep subsequent assertions sane */
+        new_bh->b_state = 0;
+        init_buffer(new_bh, NULL, NULL);
+        atomic_set(&new_bh->b_count, 1);
+        new_jh = journal_add_journal_head(new_bh);      /* This sleeps */
        /*
         * If a new transaction has already done a buffer copy-out, then
@@ -361,14 +367,6 @@ repeat:
                kunmap_atomic(mapped_data, KM_USER0);
        }
-        /* keep subsequent assertions sane */
-        new_bh->b_state = 0;
-        init_buffer(new_bh, NULL, NULL);
-        atomic_set(&new_bh->b_count, 1);
-        jbd_unlock_bh_state(bh_in);
-        new_jh = journal_add_journal_head(new_bh);      /* This sleeps */
        set_bh_page(new_bh, new_page, new_offset);
        new_jh->b_transaction = NULL;
        new_bh->b_size = jh2bh(jh_in)->b_size;
@@ -385,7 +383,11 @@ repeat:
         * copying is moved to the transaction's shadow queue.
         */
        JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
-        journal_file_buffer(jh_in, transaction, BJ_Shadow);
+        spin_lock(&journal->j_list_lock);
+        __journal_file_buffer(jh_in, transaction, BJ_Shadow);
+        spin_unlock(&journal->j_list_lock);
+        jbd_unlock_bh_state(bh_in);
        JBUFFER_TRACE(new_jh, "file as BJ_IO");
        journal_file_buffer(new_jh, transaction, BJ_IO);
@@ -848,6 +850,12 @@ static int journal_reset(journal_t *journal)
        first = be32_to_cpu(sb->s_first);
        last = be32_to_cpu(sb->s_maxlen);
+        if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
+                printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n",
+                       first, last);
+                journal_fail_superblock(journal);
+                return -EINVAL;
+        }
        journal->j_first = first;
        journal->j_last = last;
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index ed886e6db399..c03ac11f74be 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -489,34 +489,15 @@ void journal_unlock_updates (journal_t *journal)
        wake_up(&journal->j_wait_transaction_locked);
 }
-/*
+static void warn_dirty_buffer(struct buffer_head *bh)
- * Report any unexpected dirty buffers which turn up.  Normally those
- * indicate an error, but they can occur if the user is running (say)
- * tune2fs to modify the live filesystem, so we need the option of
- * continuing as gracefully as possible.  #
- *
- * The caller should already hold the journal lock and
- * j_list_lock spinlock: most callers will need those anyway
- * in order to probe the buffer's journaling state safely.
- */
-static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
 {
-        int jlist;
+        char b[BDEVNAME_SIZE];
-        /* If this buffer is one which might reasonably be dirty
-         * --- ie. data, or not part of this journal --- then
-         * we're OK to leave it alone, but otherwise we need to
-         * move the dirty bit to the journal's own internal
-         * JBDDirty bit. */
-        jlist = jh->b_jlist;
-        if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
-            jlist == BJ_Shadow || jlist == BJ_Forget) {
-                struct buffer_head *bh = jh2bh(jh);
-                if (test_clear_buffer_dirty(bh))
+        printk(KERN_WARNING
-                        set_buffer_jbddirty(bh);
+               "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
-        }
+               "There's a risk of filesystem corruption in case of system "
+               "crash.\n",
+               bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
 }
 /*
@@ -583,14 +564,16 @@ repeat:
                        if (jh->b_next_transaction)
                                J_ASSERT_JH(jh, jh->b_next_transaction ==
                                                        transaction);
+                        warn_dirty_buffer(bh);
                }
                /*
                 * In any case we need to clean the dirty flag and we must
                 * do it under the buffer lock to be sure we don't race
                 * with running write-out.
                 */
-                JBUFFER_TRACE(jh, "Unexpected dirty buffer");
+                JBUFFER_TRACE(jh, "Journalling dirty buffer");
-                jbd_unexpected_dirty_buffer(jh);
+                clear_buffer_dirty(bh);
+                set_buffer_jbddirty(bh);
        }
        unlock_buffer(bh);
@@ -826,6 +809,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
        J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
        if (jh->b_transaction == NULL) {
+                /*
+                 * Previous journal_forget() could have left the buffer
+                 * with jbddirty bit set because it was being committed. When
+                 * the commit finished, we've filed the buffer for
+                 * checkpointing and marked it dirty. Now we are reallocating
+                 * the buffer so the transaction freeing it must have
+                 * committed and so it's safe to clear the dirty bit.
+                 */
+                clear_buffer_dirty(jh2bh(jh));
                jh->b_transaction = transaction;
                /* first access by this transaction */
@@ -1686,35 +1678,6 @@ out:
        return;
 }
-/*
- * journal_try_to_free_buffers() could race with journal_commit_transaction()
- * The latter might still hold the a count on buffers when inspecting
- * them on t_syncdata_list or t_locked_list.
- *
- * journal_try_to_free_buffers() will call this function to
- * wait for the current transaction to finish syncing data buffers, before
- * tryinf to free that buffer.
- *
- * Called with journal->j_state_lock held.
- */
-static void journal_wait_for_transaction_sync_data(journal_t *journal)
-{
-        transaction_t *transaction = NULL;
-        tid_t tid;
-        spin_lock(&journal->j_state_lock);
-        transaction = journal->j_committing_transaction;
-        if (!transaction) {
-                spin_unlock(&journal->j_state_lock);
-                return;
-        }
-        tid = transaction->t_tid;
-        spin_unlock(&journal->j_state_lock);
-        log_wait_commit(journal, tid);
-}
 /**
 * int journal_try_to_free_buffers() - try to free page buffers.
 * @journal: journal for operation
@@ -1786,25 +1749,6 @@ int journal_try_to_free_buffers(journal_t *journal,
        ret = try_to_free_buffers(page);
-        /*
-         * There are a number of places where journal_try_to_free_buffers()
-         * could race with journal_commit_transaction(), the later still
-         * holds the reference to the buffers to free while processing them.
-         * try_to_free_buffers() failed to free those buffers. Some of the
-         * caller of releasepage() request page buffers to be dropped, otherwise
-         * treat the fail-to-free as errors (such as generic_file_direct_IO())
-         *
-         * So, if the caller of try_to_release_page() wants the synchronous
-         * behaviour(i.e make sure buffers are dropped upon return),
-         * let's wait for the current transaction to finish flush of
-         * dirty data buffers, then try to free those buffers again,
-         * with the journal locked.
-         */
-        if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
-                journal_wait_for_transaction_sync_data(journal);
-                ret = try_to_free_buffers(page);
-        }
 busy:
        return ret;
 }
@@ -1830,8 +1774,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
        if (jh->b_cp_transaction) {
                JBUFFER_TRACE(jh, "on running+cp transaction");
+                /*
+                 * We don't want to write the buffer anymore, clear the
+                 * bit so that we don't confuse checks in
+                 * __journal_file_buffer
+                 */
+                clear_buffer_dirty(bh);
                __journal_file_buffer(jh, transaction, BJ_Forget);
-                clear_buffer_jbddirty(bh);
                may_free = 0;
        } else {
                JBUFFER_TRACE(jh, "on running transaction");
@@ -2089,12 +2038,17 @@ void __journal_file_buffer(struct journal_head *jh,
        if (jh->b_transaction && jh->b_jlist == jlist)
                return;
-        /* The following list of buffer states needs to be consistent
-         * with __jbd_unexpected_dirty_buffer()'s handling of dirty
-         * state. */
        if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
            jlist == BJ_Shadow || jlist == BJ_Forget) {
+                /*
+                 * For metadata buffers, we track dirty bit in buffer_jbddirty
+                 * instead of buffer_dirty. We should not see a dirty bit set
+                 * here because we clear it in do_get_write_access but e.g.
+                 * tune2fs can modify the sb and set the dirty bit at any time
+                 * so we try to gracefully handle that.
+                 */
+                if (buffer_dirty(bh))
+                        warn_dirty_buffer(bh);
                if (test_clear_buffer_dirty(bh) ||
                    test_clear_buffer_jbddirty(bh))
                        was_dirty = 1;
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 17159cacbd9e..5d70b3e6d49b 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -20,9 +20,9 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/jbd2.h>
-#include <linux/marker.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
+#include <trace/events/jbd2.h>
 /*
 * Unlink a buffer from a transaction checkpoint list.
@@ -358,8 +358,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
         * journal straight away.
         */
        result = jbd2_cleanup_journal_tail(journal);
-        trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d",
+        trace_jbd2_checkpoint(journal, result);
-                   journal->j_devname, result);
        jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
        if (result <= 0)
                return result;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0b7d3b8226fd..7b4088b2364d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -16,7 +16,6 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/jbd2.h>
-#include <linux/marker.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
@@ -26,6 +25,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
+#include <trace/events/jbd2.h>
 /*
 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -253,6 +253,7 @@ static int journal_submit_data_buffers(journal_t *journal,
                 * block allocation  with delalloc. We need to write
                 * only allocated blocks here.
                 */
+                trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
                err = journal_submit_inode_data_buffers(mapping);
                if (!ret)
                        ret = err;
@@ -394,8 +395,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        commit_transaction = journal->j_running_transaction;
        J_ASSERT(commit_transaction->t_state == T_RUNNING);
-        trace_mark(jbd2_start_commit, "dev %s transaction %d",
+        trace_jbd2_start_commit(journal, commit_transaction);
-                   journal->j_devname, commit_transaction->t_tid);
        jbd_debug(1, "JBD: starting commit of transaction %d\n",
                        commit_transaction->t_tid);
@@ -409,6 +409,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
         */
        if (commit_transaction->t_synchronous_commit)
                write_op = WRITE_SYNC_PLUG;
+        trace_jbd2_commit_locking(journal, commit_transaction);
        stats.u.run.rs_wait = commit_transaction->t_max_wait;
        stats.u.run.rs_locked = jiffies;
        stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
@@ -484,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
         */
        jbd2_journal_switch_revoke_table(journal);
+        trace_jbd2_commit_flushing(journal, commit_transaction);
        stats.u.run.rs_flushing = jiffies;
        stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
                                               stats.u.run.rs_flushing);
@@ -520,6 +522,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        commit_transaction->t_state = T_COMMIT;
        spin_unlock(&journal->j_state_lock);
+        trace_jbd2_commit_logging(journal, commit_transaction);
        stats.u.run.rs_logging = jiffies;
        stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
                                                 stats.u.run.rs_logging);
@@ -1054,9 +1057,7 @@ restart_loop:
        if (journal->j_commit_callback)
                journal->j_commit_callback(journal, commit_transaction);
-        trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
+        trace_jbd2_end_commit(journal, commit_transaction);
-                   journal->j_devname, commit_transaction->t_tid,
-                   journal->j_tail_sequence);
        jbd_debug(1, "JBD: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);
        if (to_free)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 62be7d294ec2..e378cb383979 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -38,6 +38,10 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/math64.h>
+#include <linux/hash.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/jbd2.h>
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -293,6 +297,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
        unsigned int new_offset;
        struct buffer_head *bh_in = jh2bh(jh_in);
        struct jbd2_buffer_trigger_type *triggers;
+        journal_t *journal = transaction->t_journal;
        /*
         * The buffer really shouldn't be locked: only the current committing
@@ -306,6 +311,11 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
        J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
        new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+        /* keep subsequent assertions sane */
+        new_bh->b_state = 0;
+        init_buffer(new_bh, NULL, NULL);
+        atomic_set(&new_bh->b_count, 1);
+        new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
        /*
         * If a new transaction has already done a buffer copy-out, then
@@ -384,14 +394,6 @@ repeat:
                kunmap_atomic(mapped_data, KM_USER0);
        }
-        /* keep subsequent assertions sane */
-        new_bh->b_state = 0;
-        init_buffer(new_bh, NULL, NULL);
-        atomic_set(&new_bh->b_count, 1);
-        jbd_unlock_bh_state(bh_in);
-        new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
        set_bh_page(new_bh, new_page, new_offset);
        new_jh->b_transaction = NULL;
        new_bh->b_size = jh2bh(jh_in)->b_size;
@@ -408,7 +410,11 @@ repeat:
         * copying is moved to the transaction's shadow queue.
         */
        JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
-        jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
+        spin_lock(&journal->j_list_lock);
+        __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
+        spin_unlock(&journal->j_list_lock);
+        jbd_unlock_bh_state(bh_in);
        JBUFFER_TRACE(new_jh, "file as BJ_IO");
        jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
@@ -2377,6 +2383,72 @@ static void __exit journal_exit(void)
        jbd2_journal_destroy_caches();
 }
+/* 
+ * jbd2_dev_to_name is a utility function used by the jbd2 and ext4 
+ * tracing infrastructure to map a dev_t to a device name.
+ *
+ * The caller should use rcu_read_lock() in order to make sure the
+ * device name stays valid until it's done with it.  We use
+ * rcu_read_lock() as well to make sure we're safe in case the caller
+ * gets sloppy, and because rcu_read_lock() is cheap and can be safely
+ * nested.
+ */
+struct devname_cache {
+        struct rcu_head rcu;            /* kept first: free_devcache() kfree()s this address */
+        dev_t           device;         /* device number this slot currently caches */
+        char            devname[BDEVNAME_SIZE]; /* name filled in by bdevname()/__bdevname() */
+};
+#define CACHE_SIZE_BITS 6
+static struct devname_cache *devcache[1 << CACHE_SIZE_BITS];
+static DEFINE_SPINLOCK(devname_cache_lock);
+static void free_devcache(struct rcu_head *rcu)
+{
+        kfree(rcu);     /* rcu is the first member of struct devname_cache, so this frees the whole entry */
+}
+const char *jbd2_dev_to_name(dev_t device)     /* map a dev_t to a cached device-name string */
+{
+        int     i = hash_32(device, CACHE_SIZE_BITS);   /* slot index into devcache[] */
+        char    *ret;
+        struct block_device *bd;
+        struct devname_cache *new_dev;  /* per-call; a static here would be shared by concurrent callers */
+        rcu_read_lock();
+        if (devcache[i] && devcache[i]->device == device) {     /* fast path: slot already holds this device */
+                ret = devcache[i]->devname;
+                rcu_read_unlock();
+                return ret;
+        }
+        rcu_read_unlock();
+        new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);    /* allocated before taking the spinlock */
+        if (!new_dev)
+                return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
+        spin_lock(&devname_cache_lock);
+        if (devcache[i]) {
+                if (devcache[i]->device == device) {    /* someone else filled the slot meanwhile */
+                        kfree(new_dev);                 /* drop our unused private allocation */
+                        ret = devcache[i]->devname;
+                        spin_unlock(&devname_cache_lock);
+                        return ret;
+                }
+                call_rcu(&devcache[i]->rcu, free_devcache);     /* evict the old entry via free_devcache() */
+        }
+        devcache[i] = new_dev;
+        devcache[i]->device = device;
+        bd = bdget(device);
+        if (bd) {
+                bdevname(bd, devcache[i]->devname);
+                bdput(bd);
+        } else          /* no block_device found: build the name from the dev_t alone */
+                __bdevname(device, devcache[i]->devname);
+        ret = devcache[i]->devname;
+        spin_unlock(&devname_cache_lock);
+        return ret;
+}
+EXPORT_SYMBOL(jbd2_dev_to_name);
 MODULE_LICENSE("GPL");
 module_init(journal_init);
 module_exit(journal_exit);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 996ffda06bf3..6213ac728f30 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -499,34 +499,15 @@ void jbd2_journal_unlock_updates (journal_t *journal)
        wake_up(&journal->j_wait_transaction_locked);
 }
-/*
+static void warn_dirty_buffer(struct buffer_head *bh)
- * Report any unexpected dirty buffers which turn up.  Normally those
- * indicate an error, but they can occur if the user is running (say)
- * tune2fs to modify the live filesystem, so we need the option of
- * continuing as gracefully as possible.  #
- *
- * The caller should already hold the journal lock and
- * j_list_lock spinlock: most callers will need those anyway
- * in order to probe the buffer's journaling state safely.
- */
-static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
 {
-        int jlist;
+        char b[BDEVNAME_SIZE];
-        /* If this buffer is one which might reasonably be dirty
-         * --- ie. data, or not part of this journal --- then
-         * we're OK to leave it alone, but otherwise we need to
-         * move the dirty bit to the journal's own internal
-         * JBDDirty bit. */
-        jlist = jh->b_jlist;
-        if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
-            jlist == BJ_Shadow || jlist == BJ_Forget) {
-                struct buffer_head *bh = jh2bh(jh);
-                if (test_clear_buffer_dirty(bh))
+        printk(KERN_WARNING
-                        set_buffer_jbddirty(bh);
+               "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
-        }
+               "There's a risk of filesystem corruption in case of system "
+               "crash.\n",
+               bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
 }
 /*
@@ -593,14 +574,16 @@ repeat:
                        if (jh->b_next_transaction)
                                J_ASSERT_JH(jh, jh->b_next_transaction ==
                                                        transaction);
+                        warn_dirty_buffer(bh);
                }
                /*
                 * In any case we need to clean the dirty flag and we must
                 * do it under the buffer lock to be sure we don't race
                 * with running write-out.
                 */
-                JBUFFER_TRACE(jh, "Unexpected dirty buffer");
+                JBUFFER_TRACE(jh, "Journalling dirty buffer");
-                jbd_unexpected_dirty_buffer(jh);
+                clear_buffer_dirty(bh);
+                set_buffer_jbddirty(bh);
        }
        unlock_buffer(bh);
@@ -843,6 +826,15 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
        J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
        if (jh->b_transaction == NULL) {
+                /*
+                 * Previous jbd2_journal_forget() could have left the buffer
+                 * with jbddirty bit set because it was being committed. When
+                 * the commit finished, we've filed the buffer for
+                 * checkpointing and marked it dirty. Now we are reallocating
+                 * the buffer so the transaction freeing it must have
+                 * committed and so it's safe to clear the dirty bit.
+                 */
+                clear_buffer_dirty(jh2bh(jh));
                jh->b_transaction = transaction;
                /* first access by this transaction */
@@ -1547,36 +1539,6 @@ out:
        return;
 }
-/*
- * jbd2_journal_try_to_free_buffers() could race with
- * jbd2_journal_commit_transaction(). The later might still hold the
- * reference count to the buffers when inspecting them on
- * t_syncdata_list or t_locked_list.
- *
- * jbd2_journal_try_to_free_buffers() will call this function to
- * wait for the current transaction to finish syncing data buffers, before
- * try to free that buffer.
- *
- * Called with journal->j_state_lock hold.
- */
-static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
-{
-        transaction_t *transaction;
-        tid_t tid;
-        spin_lock(&journal->j_state_lock);
-        transaction = journal->j_committing_transaction;
-        if (!transaction) {
-                spin_unlock(&journal->j_state_lock);
-                return;
-        }
-        tid = transaction->t_tid;
-        spin_unlock(&journal->j_state_lock);
-        jbd2_log_wait_commit(journal, tid);
-}
 /**
 * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
 * @journal: journal for operation
@@ -1649,25 +1611,6 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
        ret = try_to_free_buffers(page);
-        /*
-         * There are a number of places where jbd2_journal_try_to_free_buffers()
-         * could race with jbd2_journal_commit_transaction(), the later still
-         * holds the reference to the buffers to free while processing them.
-         * try_to_free_buffers() failed to free those buffers. Some of the
-         * caller of releasepage() request page buffers to be dropped, otherwise
-         * treat the fail-to-free as errors (such as generic_file_direct_IO())
-         *
-         * So, if the caller of try_to_release_page() wants the synchronous
-         * behaviour(i.e make sure buffers are dropped upon return),
-         * let's wait for the current transaction to finish flush of
-         * dirty data buffers, then try to free those buffers again,
-         * with the journal locked.
-         */
-        if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
-                jbd2_journal_wait_for_transaction_sync_data(journal);
-                ret = try_to_free_buffers(page);
-        }
 busy:
        return ret;
 }
@@ -1693,8 +1636,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
        if (jh->b_cp_transaction) {
                JBUFFER_TRACE(jh, "on running+cp transaction");
+                /*
+                 * We don't want to write the buffer anymore, clear the
+                 * bit so that we don't confuse checks in
+                 * __jbd2_journal_file_buffer
+                 */
+                clear_buffer_dirty(bh);
                __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
-                clear_buffer_jbddirty(bh);
                may_free = 0;
        } else {
                JBUFFER_TRACE(jh, "on running transaction");
@@ -1945,12 +1893,17 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
        if (jh->b_transaction && jh->b_jlist == jlist)
                return;
-        /* The following list of buffer states needs to be consistent
-         * with __jbd_unexpected_dirty_buffer()'s handling of dirty
-         * state. */
        if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
            jlist == BJ_Shadow || jlist == BJ_Forget) {
+                /*
+                 * For metadata buffers, we track dirty bit in buffer_jbddirty
+                 * instead of buffer_dirty. We should not see a dirty bit set
+                 * here because we clear it in do_get_write_access but e.g.
+                 * tune2fs can modify the sb and set the dirty bit at any time
+                 * so we try to gracefully handle that.
+                 */
+                if (buffer_dirty(bh))
+                        warn_dirty_buffer(bh);
                if (test_clear_buffer_dirty(bh) ||
                    test_clear_buffer_jbddirty(bh))
                        was_dirty = 1;
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 043740dde20c..8fcb6239218e 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -156,48 +156,25 @@ static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size)
        return ERR_PTR(-EINVAL);
 }
-static struct posix_acl *jffs2_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
-        struct posix_acl *acl = JFFS2_ACL_NOT_CACHED;
-        spin_lock(&inode->i_lock);
-        if (*i_acl != JFFS2_ACL_NOT_CACHED)
-                acl = posix_acl_dup(*i_acl);
-        spin_unlock(&inode->i_lock);
-        return acl;
-}
-static void jffs2_iset_acl(struct inode *inode, struct posix_acl **i_acl, struct posix_acl *acl)
-{
-        spin_lock(&inode->i_lock);
-        if (*i_acl != JFFS2_ACL_NOT_CACHED)
-                posix_acl_release(*i_acl);
-        *i_acl = posix_acl_dup(acl);
-        spin_unlock(&inode->i_lock);
-}
 static struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
 {
-        struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
        struct posix_acl *acl;
        char *value = NULL;
        int rc, xprefix;
+        acl = get_cached_acl(inode, type);
+        if (acl != ACL_NOT_CACHED)
+                return acl;
        switch (type) {
        case ACL_TYPE_ACCESS:
-                acl = jffs2_iget_acl(inode, &f->i_acl_access);
-                if (acl != JFFS2_ACL_NOT_CACHED)
-                        return acl;
                xprefix = JFFS2_XPREFIX_ACL_ACCESS;
                break;
        case ACL_TYPE_DEFAULT:
-                acl = jffs2_iget_acl(inode, &f->i_acl_default);
-                if (acl != JFFS2_ACL_NOT_CACHED)
-                        return acl;
                xprefix = JFFS2_XPREFIX_ACL_DEFAULT;
                break;
        default:
-                return ERR_PTR(-EINVAL);
+                BUG();
        }
        rc = do_jffs2_getxattr(inode, xprefix, "", NULL, 0);
        if (rc > 0) {
@@ -215,16 +192,8 @@ static struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
        }
        if (value)
                kfree(value);
-        if (!IS_ERR(acl)) {
+        if (!IS_ERR(acl))
-                switch (type) {
+                set_cached_acl(inode, type, acl);
-                case ACL_TYPE_ACCESS:
-                        jffs2_iset_acl(inode, &f->i_acl_access, acl);
-                        break;
-                case ACL_TYPE_DEFAULT:
-                        jffs2_iset_acl(inode, &f->i_acl_default, acl);
-                        break;
-                }
-        }
        return acl;
 }
@@ -249,7 +218,6 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a
 static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 {
-        struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
        int rc, xprefix;
        if (S_ISLNK(inode->i_mode))
@@ -285,16 +253,8 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
                return -EINVAL;
        }
        rc = __jffs2_set_acl(inode, xprefix, acl);
-        if (!rc) {
+        if (!rc)
-                switch(type) {
+                set_cached_acl(inode, type, acl);
-                case ACL_TYPE_ACCESS:
-                        jffs2_iset_acl(inode, &f->i_acl_access, acl);
-                        break;
-                case ACL_TYPE_DEFAULT:
-                        jffs2_iset_acl(inode, &f->i_acl_default, acl);
-                        break;
-                }
-        }
        return rc;
 }
@@ -321,12 +281,10 @@ int jffs2_permission(struct inode *inode, int mask)
 int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 {
-        struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
        struct posix_acl *acl, *clone;
        int rc;
-        f->i_acl_default = NULL;
+        cache_no_acl(inode);
-        f->i_acl_access = NULL;
        if (S_ISLNK(*i_mode))
                return 0;       /* Symlink always has no-ACL */
@@ -339,7 +297,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
                *i_mode &= ~current_umask();
        } else {
                if (S_ISDIR(*i_mode))
-                        jffs2_iset_acl(inode, &f->i_acl_default, acl);
+                        set_cached_acl(inode, ACL_TYPE_DEFAULT, acl);
                clone = posix_acl_clone(acl, GFP_KERNEL);
                if (!clone)
@@ -350,7 +308,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
                        return rc;
                }
                if (rc > 0)
-                        jffs2_iset_acl(inode, &f->i_acl_access, clone);
+                        set_cached_acl(inode, ACL_TYPE_ACCESS, clone);
                posix_acl_release(clone);
        }
@@ -359,17 +317,16 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 int jffs2_init_acl_post(struct inode *inode)
 {
-        struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
        int rc;
-        if (f->i_acl_default) {
+        if (inode->i_default_acl) {
-                rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_DEFAULT, f->i_acl_default);
+                rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_DEFAULT, inode->i_default_acl);
                if (rc)
                        return rc;
        }
-        if (f->i_acl_access) {
+        if (inode->i_acl) {
-                rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_ACCESS, f->i_acl_access);
+                rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_ACCESS, inode->i_acl);
                if (rc)
                        return rc;
        }
@@ -377,18 +334,6 @@ int jffs2_init_acl_post(struct inode *inode)
        return 0;
 }
-void jffs2_clear_acl(struct jffs2_inode_info *f)
-{
-        if (f->i_acl_access && f->i_acl_access != JFFS2_ACL_NOT_CACHED) {
-                posix_acl_release(f->i_acl_access);
-                f->i_acl_access = JFFS2_ACL_NOT_CACHED;
-        }
-        if (f->i_acl_default && f->i_acl_default != JFFS2_ACL_NOT_CACHED) {
-                posix_acl_release(f->i_acl_default);
-                f->i_acl_default = JFFS2_ACL_NOT_CACHED;
-        }
-}
 int jffs2_acl_chmod(struct inode *inode)
 {
        struct posix_acl *acl, *clone;
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 8ca058aed384..fc929f2a14f6 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,13 +26,10 @@ struct jffs2_acl_header {
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
-#define JFFS2_ACL_NOT_CACHED ((void *)-1)
 extern int jffs2_permission(struct inode *, int);
 extern int jffs2_acl_chmod(struct inode *);
 extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
 extern int jffs2_init_acl_post(struct inode *);
-extern void jffs2_clear_acl(struct jffs2_inode_info *);
 extern struct xattr_handler jffs2_acl_access_xattr_handler;
 extern struct xattr_handler jffs2_acl_default_xattr_handler;
@@ -43,6 +40,5 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler;
 #define jffs2_acl_chmod(inode)                  (0)
 #define jffs2_init_acl_pre(dir_i,inode,mode)    (0)
 #define jffs2_init_acl_post(inode)              (0)
-#define jffs2_clear_acl(f)
 #endif  /* CONFIG_JFFS2_FS_POSIX_ACL */
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index a0244740b75a..b47679be118a 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -270,19 +270,21 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
        D2({
                int i=0;
                struct jffs2_raw_node_ref *this;
-                printk(KERN_DEBUG "After remove_node_refs_from_ino_list: \n" KERN_DEBUG);
+                printk(KERN_DEBUG "After remove_node_refs_from_ino_list: \n");
                this = ic->nodes;
+                printk(KERN_DEBUG);
                while(this) {
-                        printk( "0x%08x(%d)->", ref_offset(this), ref_flags(this));
+                        printk(KERN_CONT "0x%08x(%d)->",
+                               ref_offset(this), ref_flags(this));
                        if (++i == 5) {
-                                printk("\n" KERN_DEBUG);
+                                printk(KERN_DEBUG);
                                i=0;
                        }
                        this = this->next_in_ino;
                }
-                printk("\n");
+                printk(KERN_CONT "\n");
        });
        switch (ic->class) {
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5edc2bf20581..23c947539864 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -99,7 +99,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
        kunmap(pg);
        D2(printk(KERN_DEBUG "readpage finished\n"));
-        return 0;
+        return ret;
 }
 int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg)
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 4c41db91eaa4..c6923da98263 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -50,10 +50,6 @@ struct jffs2_inode_info {
        uint16_t flags;
        uint8_t usercompr;
        struct inode vfs_inode;
-#ifdef CONFIG_JFFS2_FS_POSIX_ACL
-        struct posix_acl *i_acl_access;
-        struct posix_acl *i_acl_default;
-#endif
 };
 #endif /* _JFFS2_FS_I */
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 2228380c47b9..a7f03b7ebcb3 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -56,10 +56,6 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
        f->target = NULL;
        f->flags = 0;
        f->usercompr = 0;
-#ifdef CONFIG_JFFS2_FS_POSIX_ACL
-        f->i_acl_access = JFFS2_ACL_NOT_CACHED;
-        f->i_acl_default = JFFS2_ACL_NOT_CACHED;
-#endif
 }
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 1fc1e92356ee..1a80301004b8 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -1424,7 +1424,6 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f)
        struct jffs2_full_dirent *fd, *fds;
        int deleted;
-        jffs2_clear_acl(f);
        jffs2_xattr_delete_inode(c, f->inocache);
        mutex_lock(&f->sem);
        deleted = f->inocache && !f->inocache->pino_nlink;
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 1d437de1e9a8..696686cc206e 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -130,9 +130,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
        if (jffs2_sum_active()) {
                s = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL);
                if (!s) {
-                        kfree(flashbuf);
                        JFFS2_WARNING("Can't allocate memory for summary\n");
-                        return -ENOMEM;
+                        ret = -ENOMEM;
+                        goto out;
                }
        }
@@ -196,7 +196,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
                                if (c->nextblock) {
                                        ret = file_dirty(c, c->nextblock);
                                        if (ret)
-                                                return ret;
+                                                goto out;
                                        /* deleting summary information of the old nextblock */
                                        jffs2_sum_reset_collected(c->summary);
                                }
@@ -207,7 +207,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
                        } else {
                                ret = file_dirty(c, jeb);
                                if (ret)
-                                        return ret;
+                                        goto out;
                        }
                        break;
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 07a22caf2687..0035c021395a 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/fs.h>
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 06ca1b8d2054..a29c7c3e3fb8 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -31,27 +31,24 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
 {
        struct posix_acl *acl;
        char *ea_name;
-        struct jfs_inode_info *ji = JFS_IP(inode);
-        struct posix_acl **p_acl;
        int size;
        char *value = NULL;
+        acl = get_cached_acl(inode, type);
+        if (acl != ACL_NOT_CACHED)
+                return acl;
        switch(type) {
                case ACL_TYPE_ACCESS:
                        ea_name = POSIX_ACL_XATTR_ACCESS;
-                        p_acl = &ji->i_acl;
                        break;
                case ACL_TYPE_DEFAULT:
                        ea_name = POSIX_ACL_XATTR_DEFAULT;
-                        p_acl = &ji->i_default_acl;
                        break;
                default:
                        return ERR_PTR(-EINVAL);
        }
-        if (*p_acl != JFS_ACL_NOT_CACHED)
-                return posix_acl_dup(*p_acl);
        size = __jfs_getxattr(inode, ea_name, NULL, 0);
        if (size > 0) {
@@ -62,17 +59,16 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
        }
        if (size < 0) {
-                if (size == -ENODATA) {
+                if (size == -ENODATA)
-                        *p_acl = NULL;
                        acl = NULL;
-                } else
+                else
                        acl = ERR_PTR(size);
        } else {
                acl = posix_acl_from_xattr(value, size);
-                if (!IS_ERR(acl))
-                        *p_acl = posix_acl_dup(acl);
        }
        kfree(value);
+        if (!IS_ERR(acl))
+                set_cached_acl(inode, type, acl);
        return acl;
 }
@@ -80,8 +76,6 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
                       struct posix_acl *acl)
 {
        char *ea_name;
-        struct jfs_inode_info *ji = JFS_IP(inode);
-        struct posix_acl **p_acl;
        int rc;
        int size = 0;
        char *value = NULL;
@@ -92,11 +86,9 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
        switch(type) {
                case ACL_TYPE_ACCESS:
                        ea_name = POSIX_ACL_XATTR_ACCESS;
-                        p_acl = &ji->i_acl;
                        break;
                case ACL_TYPE_DEFAULT:
                        ea_name = POSIX_ACL_XATTR_DEFAULT;
-                        p_acl = &ji->i_default_acl;
                        if (!S_ISDIR(inode->i_mode))
                                return acl ? -EACCES : 0;
                        break;
@@ -116,27 +108,24 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
 out:
        kfree(value);
-        if (!rc) {
+        if (!rc)
-                if (*p_acl && (*p_acl != JFS_ACL_NOT_CACHED))
+                set_cached_acl(inode, type, acl);
-                        posix_acl_release(*p_acl);
-                *p_acl = posix_acl_dup(acl);
-        }
        return rc;
 }
 static int jfs_check_acl(struct inode *inode, int mask)
 {
-        struct jfs_inode_info *ji = JFS_IP(inode);
+        struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
-        if (ji->i_acl == JFS_ACL_NOT_CACHED) {
+        if (IS_ERR(acl))
-                struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
+                return PTR_ERR(acl);
-                if (IS_ERR(acl))
+        if (acl) {
-                        return PTR_ERR(acl);
+                int error = posix_acl_permission(inode, acl, mask);
                posix_acl_release(acl);
+                return error;
        }
-        if (ji->i_acl)
-                return posix_acl_permission(inode, ji->i_acl, mask);
        return -EAGAIN;
 }
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index bbbd5f202e37..41d6045dbeb0 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -391,6 +391,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
                }
                XADaddress(xp, xaddr);
                XADlength(xp, xlen);
+                XADoffset(xp, prev);
                /*
                 * only preserve the abnr flag within the xad flags
                 * of the returned hint.
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 439901d205fe..1439f119ec83 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -74,10 +74,6 @@ struct jfs_inode_info {
        /* xattr_sem allows us to access the xattrs without taking i_mutex */
        struct rw_semaphore xattr_sem;
        lid_t   xtlid;          /* lid of xtree lock on directory */
-#ifdef CONFIG_JFS_POSIX_ACL
-        struct posix_acl *i_acl;
-        struct posix_acl *i_default_acl;
-#endif
        union {
                struct {
                        xtpage_t _xtroot;       /* 288: xtree root */
@@ -107,8 +103,6 @@ struct jfs_inode_info {
 #define i_inline u.link._inline
 #define i_inline_ea u.link._inline_ea
-#define JFS_ACL_NOT_CACHED ((void *)-1)
 #define IREAD_LOCK(ip, subclass) \
        down_read_nested(&JFS_IP(ip)->rdwrlock, subclass)
 #define IREAD_UNLOCK(ip)        up_read(&JFS_IP(ip)->rdwrlock)
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 09b1b6ee2186..37e6dcda8fc8 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -128,18 +128,6 @@ static void jfs_destroy_inode(struct inode *inode)
                ji->active_ag = -1;
        }
        spin_unlock_irq(&ji->ag_lock);
-#ifdef CONFIG_JFS_POSIX_ACL
-        if (ji->i_acl != JFS_ACL_NOT_CACHED) {
-                posix_acl_release(ji->i_acl);
-                ji->i_acl = JFS_ACL_NOT_CACHED;
-        }
-        if (ji->i_default_acl != JFS_ACL_NOT_CACHED) {
-                posix_acl_release(ji->i_default_acl);
-                ji->i_default_acl = JFS_ACL_NOT_CACHED;
-        }
-#endif
        kmem_cache_free(jfs_inode_cachep, ji);
 }
@@ -798,10 +786,6 @@ static void init_once(void *foo)
        init_rwsem(&jfs_ip->xattr_sem);
        spin_lock_init(&jfs_ip->ag_lock);
        jfs_ip->active_ag = -1;
-#ifdef CONFIG_JFS_POSIX_ACL
-        jfs_ip->i_acl = JFS_ACL_NOT_CACHED;
-        jfs_ip->i_default_acl = JFS_ACL_NOT_CACHED;
-#endif
        inode_init_once(&jfs_ip->vfs_inode);
 }
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 61dfa8173ebc..fad364548bc9 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -727,10 +727,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
                /*
                 * We're changing the ACL.  Get rid of the cached one
                 */
-                acl =JFS_IP(inode)->i_acl;
+                forget_cached_acl(inode, ACL_TYPE_ACCESS);
-                if (acl != JFS_ACL_NOT_CACHED)
-                        posix_acl_release(acl);
-                JFS_IP(inode)->i_acl = JFS_ACL_NOT_CACHED;
                return 0;
        } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) {
@@ -746,10 +743,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
                /*
                 * We're changing the default ACL.  Get rid of the cached one
                 */
-                acl =JFS_IP(inode)->i_default_acl;
+                forget_cached_acl(inode, ACL_TYPE_DEFAULT);
-                if (acl && (acl != JFS_ACL_NOT_CACHED))
-                        posix_acl_release(acl);
-                JFS_IP(inode)->i_default_acl = JFS_ACL_NOT_CACHED;
                return 0;
        }
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index dd7957064a8c..4336adba952a 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -7,6 +7,7 @@
 */
 #include <linux/module.h>
+#include <linux/smp_lock.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
@@ -126,7 +127,6 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
        struct nlm_lock *lock = &argp->lock;
        nlmclnt_next_cookie(&argp->cookie);
-        argp->state   = nsm_local_state;
        memcpy(&lock->fh, NFS_FH(fl->fl_file->f_path.dentry->d_inode), sizeof(struct nfs_fh));
        lock->caller  = utsname()->nodename;
        lock->oh.data = req->a_owner;
@@ -165,6 +165,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
        /* Set up the argument struct */
        nlmclnt_setlockargs(call, fl);
+        lock_kernel();
        if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
                if (fl->fl_type != F_UNLCK) {
                        call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
@@ -178,6 +179,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
        fl->fl_ops->fl_release_private(fl);
        fl->fl_ops = NULL;
+        unlock_kernel();
        dprintk("lockd: clnt proc returns %d\n", status);
        return status;
@@ -519,6 +521,7 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
        if (nsm_monitor(host) < 0)
                goto out;
+        req->a_args.state = nsm_local_state;
        fl->fl_flags |= FL_ACCESS;
        status = do_vfs_lock(fl);
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 6d5d4a4169e5..7fce1b525849 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -53,7 +53,7 @@ static				DEFINE_SPINLOCK(nsm_lock);
 /*
 * Local NSM state
 */
-int     __read_mostly           nsm_local_state;
+u32     __read_mostly           nsm_local_state;
 int     __read_mostly           nsm_use_hostnames;
 static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
@@ -112,6 +112,7 @@ static struct rpc_clnt *nsm_create(void)
                .program                = &nsm_program,
                .version                = NSM_VERSION,
                .authflavor             = RPC_AUTH_NULL,
+                .flags                  = RPC_CLNT_CREATE_NOPING,
        };
        return rpc_create(&args);
@@ -184,13 +185,19 @@ int nsm_monitor(const struct nlm_host *host)
        nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
        status = nsm_mon_unmon(nsm, NSMPROC_MON, &res);
-        if (res.status != 0)
+        if (unlikely(res.status != 0))
                status = -EIO;
-        if (status < 0)
+        if (unlikely(status < 0)) {
                printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name);
-        else
+                return status;
-                nsm->sm_monitored = 1;
+        }
-        return status;
+        nsm->sm_monitored = 1;
+        if (unlikely(nsm_local_state != res.state)) {
+                nsm_local_state = res.state;
+                dprintk("lockd: NSM state changed to %d\n", nsm_local_state);
+        }
+        return 0;
 }
 /**
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 1725037374c5..bd173a6ca3b1 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -10,6 +10,7 @@
 #include <linux/types.h>
 #include <linux/time.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <linux/in.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 83ee34203bd7..e577a78d7bac 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -326,6 +326,8 @@ static void nlmsvc_freegrantargs(struct nlm_rqst *call)
 {
        if (call->a_args.lock.oh.data != call->a_owner)
                kfree(call->a_args.lock.oh.data);
+        locks_release_private(&call->a_args.lock.fl);
 }
 /*
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 3688e55901fc..e1d28ddd2169 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -10,6 +10,7 @@
 #include <linux/types.h>
 #include <linux/time.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <linux/in.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/locks.c b/fs/locks.c
index ec3deea29e37..b6440f52178f 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -151,7 +151,7 @@ static struct file_lock *locks_alloc_lock(void)
        return kmem_cache_alloc(filelock_cache, GFP_KERNEL);
 }
-static void locks_release_private(struct file_lock *fl)
+void locks_release_private(struct file_lock *fl)
 {
        if (fl->fl_ops) {
                if (fl->fl_ops->fl_release_private)
@@ -165,6 +165,7 @@ static void locks_release_private(struct file_lock *fl)
        }
 }
+EXPORT_SYMBOL_GPL(locks_release_private);
 /* Free a lock which is not in use. */
 static void locks_free_lock(struct file_lock *fl)
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 3aebe322271a..6ac693faae49 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -12,13 +12,14 @@
 /* bitmap.c contains the code that handles the inode and block bitmaps */
 #include "minix.h"
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
 #include <linux/sched.h>
 static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 };
+static DEFINE_SPINLOCK(bitmap_lock);
 static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits)
 {
        unsigned i, j, sum = 0;
@@ -69,11 +70,11 @@ void minix_free_block(struct inode *inode, unsigned long block)
                return;
        }
        bh = sbi->s_zmap[zone];
-        lock_kernel();
+        spin_lock(&bitmap_lock);
        if (!minix_test_and_clear_bit(bit, bh->b_data))
                printk("minix_free_block (%s:%lu): bit already cleared\n",
                       sb->s_id, block);
-        unlock_kernel();
+        spin_unlock(&bitmap_lock);
        mark_buffer_dirty(bh);
        return;
 }
@@ -88,18 +89,18 @@ int minix_new_block(struct inode * inode)
                struct buffer_head *bh = sbi->s_zmap[i];
                int j;
-                lock_kernel();
+                spin_lock(&bitmap_lock);
                j = minix_find_first_zero_bit(bh->b_data, bits_per_zone);
                if (j < bits_per_zone) {
                        minix_set_bit(j, bh->b_data);
-                        unlock_kernel();
+                        spin_unlock(&bitmap_lock);
                        mark_buffer_dirty(bh);
                        j += i * bits_per_zone + sbi->s_firstdatazone-1;
                        if (j < sbi->s_firstdatazone || j >= sbi->s_nzones)
                                break;
                        return j;
                }
-                unlock_kernel();
+                spin_unlock(&bitmap_lock);
        }
        return 0;
 }
@@ -211,10 +212,10 @@ void minix_free_inode(struct inode * inode)
        minix_clear_inode(inode);       /* clear on-disk copy */
        bh = sbi->s_imap[ino];
-        lock_kernel();
+        spin_lock(&bitmap_lock);
        if (!minix_test_and_clear_bit(bit, bh->b_data))
                printk("minix_free_inode: bit %lu already cleared\n", bit);
-        unlock_kernel();
+        spin_unlock(&bitmap_lock);
        mark_buffer_dirty(bh);
 out:
        clear_inode(inode);             /* clear in-memory copy */
@@ -237,7 +238,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
        j = bits_per_zone;
        bh = NULL;
        *error = -ENOSPC;
-        lock_kernel();
+        spin_lock(&bitmap_lock);
        for (i = 0; i < sbi->s_imap_blocks; i++) {
                bh = sbi->s_imap[i];
                j = minix_find_first_zero_bit(bh->b_data, bits_per_zone);
@@ -245,17 +246,17 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
                        break;
        }
        if (!bh || j >= bits_per_zone) {
-                unlock_kernel();
+                spin_unlock(&bitmap_lock);
                iput(inode);
                return NULL;
        }
        if (minix_test_and_set_bit(j, bh->b_data)) {    /* shouldn't happen */
-                unlock_kernel();
+                spin_unlock(&bitmap_lock);
                printk("minix_new_inode: bit already set\n");
                iput(inode);
                return NULL;
        }
-        unlock_kernel();
+        spin_unlock(&bitmap_lock);
        mark_buffer_dirty(bh);
        j += i * bits_per_zone;
        if (!j || j > sbi->s_ninodes) {
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index e5f206467e40..d407e7a0b6fe 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -11,7 +11,6 @@
 #include "minix.h"
 #include <linux/buffer_head.h>
 #include <linux/highmem.h>
-#include <linux/smp_lock.h>
 #include <linux/swap.h>
 typedef struct minix_dir_entry minix_dirent;
@@ -20,6 +19,7 @@ typedef struct minix3_dir_entry minix3_dirent;
 static int minix_readdir(struct file *, void *, filldir_t);
 const struct file_operations minix_dir_operations = {
+        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .readdir        = minix_readdir,
        .fsync          = simple_fsync,
@@ -102,8 +102,6 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
        char *name;
        __u32 inumber;
-        lock_kernel();
        pos = (pos + chunk_size-1) & ~(chunk_size-1);
        if (pos >= inode->i_size)
                goto done;
@@ -146,7 +144,6 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
 done:
        filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset;
-        unlock_kernel();
        return 0;
 }
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index f91a23693597..74ea82d72164 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -35,8 +35,6 @@ static void minix_put_super(struct super_block *sb)
        int i;
        struct minix_sb_info *sbi = minix_sb(sb);
-        lock_kernel();
        if (!(sb->s_flags & MS_RDONLY)) {
                if (sbi->s_version != MINIX_V3)  /* s_state is now out from V3 sb */
                        sbi->s_ms->s_state = sbi->s_mount_state;
@@ -50,8 +48,6 @@ static void minix_put_super(struct super_block *sb)
        kfree(sbi->s_imap);
        sb->s_fs_info = NULL;
        kfree(sbi);
-        unlock_kernel();
 }
 static struct kmem_cache * minix_inode_cachep;
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index cb7fdd11f9a5..9dcf95b42116 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -1,3 +1,6 @@
+#ifndef FS_MINIX_H
+#define FS_MINIX_H
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/minix_fs.h>
@@ -86,3 +89,5 @@ static inline struct minix_inode_info *minix_i(struct inode *inode)
 {
        return list_entry(inode, struct minix_inode_info, vfs_inode);
 }
+#endif /* FS_MINIX_H */
diff --git a/fs/namei.c b/fs/namei.c
index 527119afb6a5..f3c5b278895a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1698,8 +1698,11 @@ struct file *do_filp_open(int dfd, const char *pathname,
        if (error)
                return ERR_PTR(error);
        error = path_walk(pathname, &nd);
-        if (error)
+        if (error) {
+                if (nd.root.mnt)
+                        path_put(&nd.root);
                return ERR_PTR(error);
+        }
        if (unlikely(!audit_dummy_context()))
                audit_inode(pathname, nd.path.dentry);
@@ -1758,7 +1761,13 @@ do_last:
                        goto exit;
                }
                filp = nameidata_to_filp(&nd, open_flag);
+                if (IS_ERR(filp))
+                        ima_counts_put(&nd.path,
+                                       acc_mode & (MAY_READ | MAY_WRITE |
+                                                   MAY_EXEC));
                mnt_drop_write(nd.path.mnt);
+                if (nd.root.mnt)
+                        path_put(&nd.root);
                return filp;
        }
@@ -1812,6 +1821,9 @@ ok:
                goto exit;
        }
        filp = nameidata_to_filp(&nd, open_flag);
+        if (IS_ERR(filp))
+                ima_counts_put(&nd.path,
+                               acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
        /*
         * It is now safe to drop the mnt write
         * because the filp has had a write taken
@@ -1819,6 +1831,8 @@ ok:
         */
        if (will_write)
                mnt_drop_write(nd.path.mnt);
+        if (nd.root.mnt)
+                path_put(&nd.root);
        return filp;
 exit_mutex_unlock:
@@ -1859,6 +1873,8 @@ do_link:
                 * with "intent.open".
                 */
                release_open_intent(&nd);
+                if (nd.root.mnt)
+                        path_put(&nd.root);
                return ERR_PTR(error);
        }
        nd.flags &= ~LOOKUP_PARENT;
diff --git a/fs/namespace.c b/fs/namespace.c
index 2dd333b0fe7f..7230787d18b0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -22,6 +22,7 @@
 #include <linux/seq_file.h>
 #include <linux/mnt_namespace.h>
 #include <linux/namei.h>
+#include <linux/nsproxy.h>
 #include <linux/security.h>
 #include <linux/mount.h>
 #include <linux/ramfs.h>
@@ -42,6 +43,8 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
 static int event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
+static int mnt_id_start = 0;
+static int mnt_group_start = 1;
 static struct list_head *mount_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
@@ -69,7 +72,9 @@ static int mnt_alloc_id(struct vfsmount *mnt)
 retry:
        ida_pre_get(&mnt_id_ida, GFP_KERNEL);
        spin_lock(&vfsmount_lock);
-        res = ida_get_new(&mnt_id_ida, &mnt->mnt_id);
+        res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
+        if (!res)
+                mnt_id_start = mnt->mnt_id + 1;
        spin_unlock(&vfsmount_lock);
        if (res == -EAGAIN)
                goto retry;
@@ -79,8 +84,11 @@ retry:
 static void mnt_free_id(struct vfsmount *mnt)
 {
+        int id = mnt->mnt_id;
        spin_lock(&vfsmount_lock);
-        ida_remove(&mnt_id_ida, mnt->mnt_id);
+        ida_remove(&mnt_id_ida, id);
+        if (mnt_id_start > id)
+                mnt_id_start = id;
        spin_unlock(&vfsmount_lock);
 }
@@ -91,10 +99,18 @@ static void mnt_free_id(struct vfsmount *mnt)
 */
 static int mnt_alloc_group_id(struct vfsmount *mnt)
 {
+        int res;
        if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
                return -ENOMEM;
-        return ida_get_new_above(&mnt_group_ida, 1, &mnt->mnt_group_id);
+        res = ida_get_new_above(&mnt_group_ida,
+                                mnt_group_start,
+                                &mnt->mnt_group_id);
+        if (!res)
+                mnt_group_start = mnt->mnt_group_id + 1;
+        return res;
 }
 /*
@@ -102,7 +118,10 @@ static int mnt_alloc_group_id(struct vfsmount *mnt)
 */
 void mnt_release_group_id(struct vfsmount *mnt)
 {
-        ida_remove(&mnt_group_ida, mnt->mnt_group_id);
+        int id = mnt->mnt_group_id;
+        ida_remove(&mnt_group_ida, id);
+        if (mnt_group_start > id)
+                mnt_group_start = id;
        mnt->mnt_group_id = 0;
 }
@@ -297,7 +316,8 @@ EXPORT_SYMBOL_GPL(mnt_clone_write);
 */
 int mnt_want_write_file(struct file *file)
 {
-        if (!(file->f_mode & FMODE_WRITE))
+        struct inode *inode = file->f_dentry->d_inode;
+        if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
                return mnt_want_write(file->f_path.mnt);
        else
                return mnt_clone_write(file->f_path.mnt);
@@ -1937,6 +1957,21 @@ dput_out:
        return retval;
 }
+/*
+ * Allocate a mount namespace in its pristine state: a reference count of
+ * one, no root mount, an empty list, an initialised poll waitqueue and an
+ * event counter of zero.
+ *
+ * Returns the new namespace, or ERR_PTR(-ENOMEM) if kmalloc() fails.
+ */
+static struct mnt_namespace *alloc_mnt_ns(void)
+{
+        struct mnt_namespace *ns = kmalloc(sizeof(*ns), GFP_KERNEL);
+        if (ns == NULL)
+                return ERR_PTR(-ENOMEM);
+        ns->root = NULL;
+        ns->event = 0;
+        atomic_set(&ns->count, 1);
+        INIT_LIST_HEAD(&ns->list);
+        init_waitqueue_head(&ns->poll);
+        return ns;
+}
 /*
 * Allocate a new namespace structure and populate it with contents
 * copied from the namespace of the passed in task structure.
@@ -1948,14 +1983,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
        struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
        struct vfsmount *p, *q;
-        new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
+        new_ns = alloc_mnt_ns();
-        if (!new_ns)
+        if (IS_ERR(new_ns))
-                return ERR_PTR(-ENOMEM);
+                return new_ns;
-        atomic_set(&new_ns->count, 1);
-        INIT_LIST_HEAD(&new_ns->list);
-        init_waitqueue_head(&new_ns->poll);
-        new_ns->event = 0;
        down_write(&namespace_sem);
        /* First pass: copy the tree topology */
@@ -2019,6 +2049,24 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
        return new_ns;
 }
+/**
+ * create_mnt_ns - creates a private namespace and adds a root filesystem
+ * @mnt: pointer to the new root filesystem mountpoint
+ *
+ * On success @mnt becomes the root of the freshly allocated namespace, its
+ * mnt_ns points back at that namespace, and the namespace's list is linked
+ * to @mnt's mnt_list.  If the allocation fails, @mnt is left untouched and
+ * the ERR_PTR() value from alloc_mnt_ns() is returned.
+ */
+struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
+{
+        struct mnt_namespace *ns = alloc_mnt_ns();
+        if (IS_ERR(ns))
+                return ns;
+        mnt->mnt_ns = ns;
+        ns->root = mnt;
+        list_add(&ns->list, &mnt->mnt_list);
+        return ns;
+}
+EXPORT_SYMBOL(create_mnt_ns);
 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
                char __user *, type, unsigned long, flags, void __user *, data)
 {
@@ -2194,16 +2242,9 @@ static void __init init_mount_tree(void)
        mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
        if (IS_ERR(mnt))
                panic("Can't create rootfs");
-        ns = kmalloc(sizeof(*ns), GFP_KERNEL);
+        ns = create_mnt_ns(mnt);
-        if (!ns)
+        if (IS_ERR(ns))
                panic("Can't allocate initial namespace");
-        atomic_set(&ns->count, 1);
-        INIT_LIST_HEAD(&ns->list);
-        init_waitqueue_head(&ns->poll);
-        ns->event = 0;
-        list_add(&mnt->mnt_list, &ns->list);
-        ns->root = mnt;
-        mnt->mnt_ns = ns;
        init_task.nsproxy->mnt_ns = ns;
        get_mnt_ns(ns);
@@ -2246,10 +2287,14 @@ void __init mnt_init(void)
        init_mount_tree();
 }
-void __put_mnt_ns(struct mnt_namespace *ns)
+void put_mnt_ns(struct mnt_namespace *ns)
 {
-        struct vfsmount *root = ns->root;
+        struct vfsmount *root;
        LIST_HEAD(umount_list);
+        if (!atomic_dec_and_lock(&ns->count, &vfsmount_lock))
+                return;
+        root = ns->root;
        ns->root = NULL;
        spin_unlock(&vfsmount_lock);
        down_write(&namespace_sem);
@@ -2260,3 +2305,4 @@ void __put_mnt_ns(struct mnt_namespace *ns)
        release_mounts(&umount_list);
        kfree(ns);
 }
+EXPORT_SYMBOL(put_mnt_ns);
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 97645f112114..0ec6237a5970 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -1113,11 +1113,13 @@ ncp__io2vol(struct ncp_server *server, unsigned char *vname, unsigned int *vlen,
                if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
                        int k;
+                        unicode_t u;
-                        k = utf8_mbtowc(&ec, iname, iname_end - iname);
+                        k = utf8_to_utf32(iname, iname_end - iname, &u);
-                        if (k < 0)
+                        if (k < 0 || u > MAX_WCHAR_T)
                                return -EINVAL;
                        iname += k;
+                        ec = u;
                } else {
                        if (*iname == NCP_ESC) {
                                int k;
@@ -1214,7 +1216,7 @@ ncp__vol2io(struct ncp_server *server, unsigned char *iname, unsigned int *ilen,
                if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
                        int k;
-                        k = utf8_wctomb(iname, ec, iname_end - iname);
+                        k = utf32_to_utf8(ec, iname, iname_end - iname);
                        if (k < 0) {
                                err = -ENAMETOOLONG;
                                goto quit;
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index e67f3ec07736..2a77bc25d5af 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -1,6 +1,6 @@
 config NFS_FS
        tristate "NFS client support"
-        depends on INET
+        depends on INET && FILE_LOCKING
        select LOCKD
        select SUNRPC
        select NFS_ACL_SUPPORT if NFS_V3_ACL
@@ -74,6 +74,15 @@ config NFS_V4
          If unsure, say N.
+config NFS_V4_1
+        bool "NFS client support for NFSv4.1 (DEVELOPER ONLY)"
+        depends on NFS_V4 && EXPERIMENTAL
+        help
+          This option enables support for minor version 1 of the NFSv4 protocol
+          (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client.
+          Unless you're an NFS developer, say N.
 config ROOT_NFS
        bool "Root file system on NFS"
        depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index a886e692ddd0..7f604c7941fb 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -17,6 +17,9 @@
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/sunrpc/svcauth_gss.h>
+#if defined(CONFIG_NFS_V4_1)
+#include <linux/sunrpc/bc_xprt.h>
+#endif
 #include <net/inet_sock.h>
@@ -28,11 +31,12 @@
 struct nfs_callback_data {
        unsigned int users;
+        struct svc_serv *serv;
        struct svc_rqst *rqst;
        struct task_struct *task;
 };
-static struct nfs_callback_data nfs_callback_info;
+static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
 static DEFINE_MUTEX(nfs_callback_mutex);
 static struct svc_program nfs4_callback_program;
@@ -56,10 +60,10 @@ module_param_call(callback_tcpport, param_set_port, param_get_int,
                 &nfs_callback_set_tcpport, 0644);
 /*
- * This is the callback kernel thread.
+ * This is the NFSv4 callback kernel thread.
 */
 static int
-nfs_callback_svc(void *vrqstp)
+nfs4_callback_svc(void *vrqstp)
 {
        int err, preverr = 0;
        struct svc_rqst *rqstp = vrqstp;
@@ -97,20 +101,12 @@ nfs_callback_svc(void *vrqstp)
 }
 /*
- * Bring up the callback thread if it is not already up.
+ * Prepare to bring up the NFSv4 callback service
 */
-int nfs_callback_up(void)
+struct svc_rqst *
+nfs4_callback_up(struct svc_serv *serv)
 {
-        struct svc_serv *serv = NULL;
+        int ret;
-        int ret = 0;
-        mutex_lock(&nfs_callback_mutex);
-        if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
-                goto out;
-        serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
-        ret = -ENOMEM;
-        if (!serv)
-                goto out_err;
        ret = svc_create_xprt(serv, "tcp", PF_INET,
                                nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
@@ -127,27 +123,174 @@ int nfs_callback_up(void)
                nfs_callback_tcpport6 = ret;
                dprintk("NFS: Callback listener port = %u (af %u)\n",
                                nfs_callback_tcpport6, PF_INET6);
-        } else if (ret != -EAFNOSUPPORT)
+        } else if (ret == -EAFNOSUPPORT)
+                ret = 0;
+        else
                goto out_err;
 #endif  /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
-        nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
+        return svc_prepare_thread(serv, &serv->sv_pools[0]);
-        if (IS_ERR(nfs_callback_info.rqst)) {
-                ret = PTR_ERR(nfs_callback_info.rqst);
+out_err:
-                nfs_callback_info.rqst = NULL;
+        if (ret == 0)
+                ret = -ENOMEM;
+        return ERR_PTR(ret);
+}
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * The callback service for NFSv4.1 callbacks
+ *
+ * Thread function: vrqstp is this thread's struct svc_rqst, and the
+ * svc_serv is taken from its rq_server field.  Until kthread_should_stop()
+ * reports true, each pass registers on serv->sv_cb_waitq and then looks at
+ * serv->sv_cb_list under sv_cb_lock.  If a request is queued it is unlinked
+ * and passed to bc_svc_process(); otherwise the thread calls schedule().
+ * The value returned by bc_svc_process() is only logged.  The whole loop
+ * runs with the BKL held (see the FIXME below).  Always returns 0.
+ */
+static int
+nfs41_callback_svc(void *vrqstp)
+{
+        struct svc_rqst *rqstp = vrqstp;
+        struct svc_serv *serv = rqstp->rq_server;
+        struct rpc_rqst *req;
+        int error;
+        DEFINE_WAIT(wq);
+        set_freezable();
+        /*
+         * FIXME: do we really need to run this under the BKL? If so, please
+         * add a comment about what it's intended to protect.
+         */
+        lock_kernel();
+        while (!kthread_should_stop()) {
+                prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
+                spin_lock_bh(&serv->sv_cb_lock);
+                if (!list_empty(&serv->sv_cb_list)) {
+                        req = list_first_entry(&serv->sv_cb_list,
+                                        struct rpc_rqst, rq_bc_list);
+                        list_del(&req->rq_bc_list);
+                        spin_unlock_bh(&serv->sv_cb_lock);
+                        dprintk("Invoking bc_svc_process()\n");
+                        error = bc_svc_process(serv, req, rqstp);
+                        dprintk("bc_svc_process() returned w/ error code= %d\n",
+                                error);
+                } else {
+                        spin_unlock_bh(&serv->sv_cb_lock);
+                        schedule();
+                }
+                finish_wait(&serv->sv_cb_waitq, &wq);
+        }
+        unlock_kernel();
+        return 0;
+}
+/*
+ * Bring up the NFSv4.1 callback service
+ *
+ * Creates a svc transport for serv from xprt's protocol, cross-links serv
+ * and xprt, initialises the callback list, lock and waitqueue on serv, and
+ * prepares the service thread's svc_rqst.
+ *
+ * Returns the prepared svc_rqst, ERR_PTR(-ENOMEM) if the transport could
+ * not be created, or the ERR_PTR() value from svc_prepare_thread().
+ */
+struct svc_rqst *
+nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
+{
+        struct svc_xprt *bc_xprt;
+        struct svc_rqst *rqstp = ERR_PTR(-ENOMEM);
+        dprintk("--> %s\n", __func__);
+        /* Create a svc_sock for the service */
+        bc_xprt = svc_sock_create(serv, xprt->prot);
+        if (!bc_xprt)
+                goto out;
+        /*
+         * Save the svc_serv in the transport so that it can
+         * be referenced when the session backchannel is initialized
+         */
+        serv->bc_xprt = bc_xprt;
+        xprt->bc_serv = serv;
+        INIT_LIST_HEAD(&serv->sv_cb_list);
+        spin_lock_init(&serv->sv_cb_lock);
+        init_waitqueue_head(&serv->sv_cb_waitq);
+        rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
+        if (IS_ERR(rqstp)) {
+                svc_sock_destroy(bc_xprt);
+                /* The transport is gone: do not leave a stale pointer */
+                serv->bc_xprt = NULL;
+                /*
+                 * NOTE(review): xprt->bc_serv still refers to serv here;
+                 * confirm the caller's error path copes with that.
+                 */
+        }
+out:
+        dprintk("--> %s return %p\n", __func__, rqstp);
+        return rqstp;
+}
+/*
+ * Minor-version specific part of nfs_callback_up().  For a non-zero
+ * minorversion, bring up the NFSv4.1 service via nfs41_callback_up(),
+ * storing its result in *rqstpp and the thread function in *callback_svc.
+ * For minor version 0 nothing is written.  Returns minorversion, so a zero
+ * return tells the caller to perform the v4.0 setup itself.
+ */
+static inline int nfs_minorversion_callback_svc_setup(u32 minorversion,
+                struct svc_serv *serv, struct rpc_xprt *xprt,
+                struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
+{
+        if (minorversion == 0)
+                return minorversion;
+        *rqstpp = nfs41_callback_up(serv, xprt);
+        *callback_svc = nfs41_callback_svc;
+        return minorversion;
+}
+/*
+ * Point xprt's bc_serv at the svc_serv recorded in cb_info.  Only done for
+ * a non-zero minor version; minor version 0 leaves xprt untouched.
+ */
+static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
+                struct nfs_callback_data *cb_info)
+{
+        if (!minorversion)
+                return;
+        xprt->bc_serv = cb_info->serv;
+}
+#else
+/*
+ * Variant built when CONFIG_NFS_V4_1 is not defined: there is no
+ * minor-version specific setup, so *rqstpp and *callback_svc are left
+ * untouched and 0 is always returned.
+ */
+static inline int nfs_minorversion_callback_svc_setup(u32 minorversion,
+                struct svc_serv *serv, struct rpc_xprt *xprt,
+                struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
+{
+        return 0;
+}
+/* Variant built when CONFIG_NFS_V4_1 is not defined: intentionally a no-op. */
+static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
+                struct nfs_callback_data *cb_info)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+/*
+ * Bring up the callback thread if it is not already up.
+ */
+int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
+{
+        struct svc_serv *serv = NULL;
+        struct svc_rqst *rqstp;
+        int (*callback_svc)(void *vrqstp);
+        struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+        char svc_name[12];
+        int ret = 0;
+        int minorversion_setup;
+        mutex_lock(&nfs_callback_mutex);
+        if (cb_info->users++ || cb_info->task != NULL) {
+                nfs_callback_bc_serv(minorversion, xprt, cb_info);
+                goto out;
+        }
+        serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
+        if (!serv) {
+                ret = -ENOMEM;
+                goto out_err;
+        }
+        minorversion_setup =  nfs_minorversion_callback_svc_setup(minorversion,
+                                        serv, xprt, &rqstp, &callback_svc);
+        if (!minorversion_setup) {
+                /* v4.0 callback setup */
+                rqstp = nfs4_callback_up(serv);
+                callback_svc = nfs4_callback_svc;
+        }
+        if (IS_ERR(rqstp)) {
+                ret = PTR_ERR(rqstp);
                goto out_err;
        }
        svc_sock_update_bufs(serv);
-        nfs_callback_info.task = kthread_run(nfs_callback_svc,
+        sprintf(svc_name, "nfsv4.%u-svc", minorversion);
-                                             nfs_callback_info.rqst,
+        cb_info->serv = serv;
-                                             "nfsv4-svc");
+        cb_info->rqst = rqstp;
-        if (IS_ERR(nfs_callback_info.task)) {
+        cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name);
-                ret = PTR_ERR(nfs_callback_info.task);
+        if (IS_ERR(cb_info->task)) {
-                svc_exit_thread(nfs_callback_info.rqst);
+                ret = PTR_ERR(cb_info->task);
-                nfs_callback_info.rqst = NULL;
+                svc_exit_thread(cb_info->rqst);
-                nfs_callback_info.task = NULL;
+                cb_info->rqst = NULL;
+                cb_info->task = NULL;
                goto out_err;
        }
 out:
@@ -164,22 +307,25 @@ out:
 out_err:
        dprintk("NFS: Couldn't create callback socket or server thread; "
                "err = %d\n", ret);
-        nfs_callback_info.users--;
+        cb_info->users--;
        goto out;
 }
 /*
 * Kill the callback thread if it's no longer being used.
 */
-void nfs_callback_down(void)
+void nfs_callback_down(int minorversion)
 {
+        struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
        mutex_lock(&nfs_callback_mutex);
-        nfs_callback_info.users--;
+        cb_info->users--;
-        if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) {
+        if (cb_info->users == 0 && cb_info->task != NULL) {
-                kthread_stop(nfs_callback_info.task);
+                kthread_stop(cb_info->task);
-                svc_exit_thread(nfs_callback_info.rqst);
+                svc_exit_thread(cb_info->rqst);
-                nfs_callback_info.rqst = NULL;
+                cb_info->serv = NULL;
-                nfs_callback_info.task = NULL;
+                cb_info->rqst = NULL;
+                cb_info->task = NULL;
        }
        mutex_unlock(&nfs_callback_mutex);
 }
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index e110e286a262..07baa8254ca1 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -20,13 +20,24 @@ enum nfs4_callback_procnum {
 enum nfs4_callback_opnum {
        OP_CB_GETATTR = 3,
        OP_CB_RECALL  = 4,
+/* Callback operations new to NFSv4.1 */
+        OP_CB_LAYOUTRECALL  = 5,
+        OP_CB_NOTIFY        = 6,
+        OP_CB_PUSH_DELEG    = 7,
+        OP_CB_RECALL_ANY    = 8,
+        OP_CB_RECALLABLE_OBJ_AVAIL = 9,
+        OP_CB_RECALL_SLOT   = 10,
+        OP_CB_SEQUENCE      = 11,
+        OP_CB_WANTS_CANCELLED = 12,
+        OP_CB_NOTIFY_LOCK   = 13,
+        OP_CB_NOTIFY_DEVICEID = 14,
        OP_CB_ILLEGAL = 10044,
 };
 struct cb_compound_hdr_arg {
        unsigned int taglen;
        const char *tag;
-        unsigned int callback_ident;
+        unsigned int minorversion;
        unsigned nops;
 };
@@ -59,16 +70,59 @@ struct cb_recallargs {
        uint32_t truncate;
 };
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Element type of referring_call_list.rcl_refcalls: a sequence id paired
+ * with a slot id.
+ */
+struct referring_call {
+        uint32_t                        rc_sequenceid;
+        uint32_t                        rc_slotid;
+};
+/*
+ * A session id together with an array of referring calls.  rcl_refcalls is
+ * released with kfree() by nfs4_callback_sequence().
+ * NOTE(review): rcl_nrefcalls is presumably the number of rcl_refcalls
+ * entries - confirm against the XDR decoder.
+ */
+struct referring_call_list {
+        struct nfs4_sessionid           rcl_sessionid;
+        uint32_t                        rcl_nrefcalls;
+        struct referring_call           *rcl_refcalls;
+};
+/*
+ * Arguments handed to nfs4_callback_sequence().  As used there:
+ *  - csa_addr and csa_sessionid select the nfs_client whose session matches;
+ *  - csa_slotid and csa_sequenceid are validated against the client's
+ *    backchannel slot table and echoed back in the result;
+ *  - csa_rclists is an array of csa_nrclists referring call lists, which
+ *    nfs4_callback_sequence() frees with kfree().
+ * csa_highestslotid and csa_cachethis are not read by that function.
+ */
+struct cb_sequenceargs {
+        struct sockaddr                 *csa_addr;
+        struct nfs4_sessionid           csa_sessionid;
+        uint32_t                        csa_sequenceid;
+        uint32_t                        csa_slotid;
+        uint32_t                        csa_highestslotid;
+        uint32_t                        csa_cachethis;
+        uint32_t                        csa_nrclists;
+        struct referring_call_list      *csa_rclists;
+};
+/*
+ * Result filled in by nfs4_callback_sequence().  csr_status is always set;
+ * the remaining fields are only written when the session was found and the
+ * sequence id validated: the session id, sequence id and slot id are copied
+ * from the arguments, and both highest-slot fields are set to
+ * NFS41_BC_MAX_CALLBACKS - 1.
+ */
+struct cb_sequenceres {
+        __be32                          csr_status;
+        struct nfs4_sessionid           csr_sessionid;
+        uint32_t                        csr_sequenceid;
+        uint32_t                        csr_slotid;
+        uint32_t                        csr_highestslotid;
+        uint32_t                        csr_target_highestslotid;
+};
+extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
+                                       struct cb_sequenceres *res);
+#endif /* CONFIG_NFS_V4_1 */
 extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
 extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
 #ifdef CONFIG_NFS_V4
-extern int nfs_callback_up(void);
+extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
-extern void nfs_callback_down(void);
+extern void nfs_callback_down(int minorversion);
-#else
+#endif /* CONFIG_NFS_V4 */
-#define nfs_callback_up()       (0)
-#define nfs_callback_down()     do {} while(0)
+/*
-#endif
+ * nfs41: Callbacks are expected to not cause substantial latency,
+ * so we limit their concurrency to 1 by setting up the maximum number
+ * of slots for the backchannel.
+ */
+#define NFS41_BC_MIN_CALLBACKS 1
+#define NFS41_BC_MAX_CALLBACKS 1
 extern unsigned int nfs_callback_set_tcpport;
 extern unsigned short nfs_callback_tcpport;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index f7e83e23cf9f..b7da1f54da68 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -101,3 +101,130 @@ out:
        dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
        return res;
 }
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Validate the sequenceID sent by the server.
+ * Return success if the sequenceID is one more than what we last saw on
+ * this slot, accounting for wraparound.  Increments the slot's sequence.
+ *
+ * We don't yet implement a duplicate request cache, so at this time
+ * we will log replays, and process them as if we had not seen them before,
+ * but we don't bump the sequence in the slot.  Not too worried about it,
+ * since we only currently implement idempotent callbacks anyway.
+ *
+ * We have a single slot backchannel at this time, so we don't bother
+ * checking the used_slots bit array on the table.  The lower layer guarantees
+ * a single outstanding callback request at a time.
+ *
+ * Returns htonl(NFS4_OK) on success, htonl(NFS4ERR_BADSLOT) if slotid is
+ * above the highest backchannel slot id (NFS41_BC_MAX_CALLBACKS - 1), and
+ * htonl(NFS4ERR_SEQ_MISORDERED) for any other sequenceID.
+ */
+static int
+validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid)
+{
+        struct nfs4_slot *slot;
+        dprintk("%s enter. slotid %d seqid %d\n",
+                __func__, slotid, seqid);
+        /* Slot ids start at 0, so NFS41_BC_MAX_CALLBACKS itself is invalid */
+        if (slotid >= NFS41_BC_MAX_CALLBACKS)
+                return htonl(NFS4ERR_BADSLOT);
+        slot = tbl->slots + slotid;
+        dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr);
+        /* Normal */
+        if (likely(seqid == slot->seq_nr + 1)) {
+                slot->seq_nr++;
+                return htonl(NFS4_OK);
+        }
+        /* Replay */
+        if (seqid == slot->seq_nr) {
+                dprintk("%s seqid %d is a replay - no DRC available\n",
+                        __func__, seqid);
+                return htonl(NFS4_OK);
+        }
+        /* Wraparound */
+        if (seqid == 1 && (slot->seq_nr + 1) == 0) {
+                slot->seq_nr = 1;
+                return htonl(NFS4_OK);
+        }
+        /* Misordered request */
+        return htonl(NFS4ERR_SEQ_MISORDERED);
+}
+/*
+ * Returns a pointer to a held 'struct nfs_client' that matches the server's
+ * address, major version number, and session ID.  It is the caller's
+ * responsibility to release the returned reference.
+ *
+ * addr and nfsversion are passed to nfs_find_client() to obtain the first
+ * candidate; further candidates come from nfs_find_client_next().  A
+ * candidate matches when it has a session whose id equals *sessionid over
+ * NFS4_MAX_SESSIONID_LEN bytes.
+ *
+ * Returns NULL if there are no connections with sessions, or if no session
+ * matches the one of interest.
+ */
+static struct nfs_client *find_client_with_session(
+        const struct sockaddr *addr, u32 nfsversion,
+        struct nfs4_sessionid *sessionid)
+{
+        struct nfs_client *clp;
+        /* Use the caller's version rather than a hard-coded 4 */
+        clp = nfs_find_client(addr, nfsversion);
+        if (clp == NULL)
+                return NULL;
+        do {
+                struct nfs_client *prev = clp;
+                if (clp->cl_session != NULL) {
+                        if (memcmp(clp->cl_session->sess_id.data,
+                                        sessionid->data,
+                                        NFS4_MAX_SESSIONID_LEN) == 0) {
+                                /* Returns a held reference to clp */
+                                return clp;
+                        }
+                }
+                /* Look up the successor before dropping our hold on prev */
+                clp = nfs_find_client_next(prev);
+                nfs_put_client(prev);
+        } while (clp != NULL);
+        return NULL;
+}
+/*
+ * Process a callback sequence request.
+ *
+ * The referring call lists in args are freed up front without being
+ * examined.  The client is then looked up by address and session id, and
+ * the slot id / sequence id pair is checked with validate_seqid().  Only
+ * when both steps succeed are the session id, sequence id, slot id and
+ * highest slot ids filled in.
+ *
+ * Returns the status also stored in res->csr_status:
+ * htonl(NFS4ERR_BADSESSION) when no client matches, otherwise the result
+ * of validate_seqid().
+ *
+ * FIXME: referring calls should be processed
+ */
+unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
+                                struct cb_sequenceres *res)
+{
+        struct nfs_client *clp;
+        uint32_t i;     /* same type as csa_nrclists: no signed/unsigned mix */
+        int status;
+        for (i = 0; i < args->csa_nrclists; i++)
+                kfree(args->csa_rclists[i].rcl_refcalls);
+        kfree(args->csa_rclists);
+        status = htonl(NFS4ERR_BADSESSION);
+        clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid);
+        if (clp == NULL)
+                goto out;
+        status = validate_seqid(&clp->cl_session->bc_slot_table,
+                                args->csa_slotid, args->csa_sequenceid);
+        if (status)
+                goto out_putclient;
+        memcpy(&res->csr_sessionid, &args->csa_sessionid,
+               sizeof(res->csr_sessionid));
+        res->csr_sequenceid = args->csa_sequenceid;
+        res->csr_slotid = args->csa_slotid;
+        res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+        res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+out_putclient:
+        /* Drop the reference taken by find_client_with_session() */
+        nfs_put_client(clp);
+out:
+        dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+        res->csr_status = status;
+        return res->csr_status;
+}
+#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index dd0ef34b5845..e5a2dac5f715 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -20,6 +20,11 @@
                                2 + 2 + 3 + 3)
 #define CB_OP_RECALL_RES_MAXSZ  (CB_OP_HDR_RES_MAXSZ)
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Upper bound on the size of a CB_SEQUENCE reply: the common operation
+ * header plus 4 + 1 + 3 further units.
+ * NOTE(review): presumably these are XDR words for the session id (4), the
+ * sequence id (1) and the three slot id fields (3) - confirm against
+ * encode_cb_sequence_res().
+ */
+#define CB_OP_SEQUENCE_RES_MAXSZ        (CB_OP_HDR_RES_MAXSZ + \
+                                        4 + 1 + 3)
+#endif /* CONFIG_NFS_V4_1 */
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
 typedef __be32 (*callback_process_op_t)(void *, void *);
@@ -132,7 +137,6 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
 static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
 {
        __be32 *p;
-        unsigned int minor_version;
        __be32 status;
        status = decode_string(xdr, &hdr->taglen, &hdr->tag);
@@ -147,15 +151,19 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
        p = read_buf(xdr, 12);
        if (unlikely(p == NULL))
                return htonl(NFS4ERR_RESOURCE);
-        minor_version = ntohl(*p++);
+        hdr->minorversion = ntohl(*p++);
-        /* Check minor version is zero. */
+        /* Check minor version is zero or one. */
-        if (minor_version != 0) {
+        if (hdr->minorversion <= 1) {
-                printk(KERN_WARNING "%s: NFSv4 server callback with illegal minor version %u!\n",
+                p++;    /* skip callback_ident */
-                                __func__, minor_version);
+        } else {
+                printk(KERN_WARNING "%s: NFSv4 server callback with "
+                        "illegal minor version %u!\n",
+                        __func__, hdr->minorversion);
                return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
        }
-        hdr->callback_ident = ntohl(*p++);
        hdr->nops = ntohl(*p);
+        dprintk("%s: minorversion %d nops %d\n", __func__,
+                hdr->minorversion, hdr->nops);
        return 0;
 }
@@ -204,6 +212,122 @@ out:
        return status;
 }
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Decode a fixed-length session id from the XDR stream.
+ *
+ * Copies NFS4_MAX_SESSIONID_LEN bytes from the stream into sid->data.
+ * Returns 0 on success, or NFS4ERR_RESOURCE (network byte order) when
+ * read_buf() cannot supply that many bytes.
+ */
+static unsigned decode_sessionid(struct xdr_stream *xdr,
+                                 struct nfs4_sessionid *sid)
+{
+        uint32_t *p;
+        int len = NFS4_MAX_SESSIONID_LEN;
+        p = read_buf(xdr, len);
+        if (unlikely(p == NULL))
+                return htonl(NFS4ERR_RESOURCE);
+        memcpy(sid->data, p, len);
+        return 0;
+}
+/*
+ * Decode one referring call list from the XDR stream.
+ *
+ * Reads the session id, the number of referring calls and then one
+ * (sequence id, slot id) pair per call into a freshly allocated
+ * rc_list->rcl_refcalls array. The caller owns that array and must kfree()
+ * it; it is NULL when the list is empty or when decoding fails.
+ *
+ * Returns 0 on success or an NFS4ERR_* value in network byte order.
+ */
+static unsigned decode_rc_list(struct xdr_stream *xdr,
+                               struct referring_call_list *rc_list)
+{
+        uint32_t *p;
+        int i;
+        unsigned status;
+        /*
+         * Callers kfree() rcl_refcalls unconditionally, so never leave it
+         * uninitialised: not for an empty list and not on an early failure.
+         */
+        rc_list->rcl_refcalls = NULL;
+        status = decode_sessionid(xdr, &rc_list->rcl_sessionid);
+        if (status)
+                goto out;
+        status = htonl(NFS4ERR_RESOURCE);
+        p = read_buf(xdr, sizeof(uint32_t));
+        if (unlikely(p == NULL))
+                goto out;
+        rc_list->rcl_nrefcalls = ntohl(*p++);
+        if (rc_list->rcl_nrefcalls) {
+                /*
+                 * The count comes straight off the wire: refuse values whose
+                 * byte length would wrap instead of reading a short buffer.
+                 */
+                if (rc_list->rcl_nrefcalls > INT_MAX / (2 * sizeof(uint32_t)))
+                        goto out;
+                p = read_buf(xdr,
+                             rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t));
+                if (unlikely(p == NULL))
+                        goto out;
+                /* kcalloc() rejects element counts whose total size overflows */
+                rc_list->rcl_refcalls = kcalloc(rc_list->rcl_nrefcalls,
+                                                sizeof(*rc_list->rcl_refcalls),
+                                                GFP_KERNEL);
+                if (unlikely(rc_list->rcl_refcalls == NULL))
+                        goto out;
+                for (i = 0; i < rc_list->rcl_nrefcalls; i++) {
+                        rc_list->rcl_refcalls[i].rc_sequenceid = ntohl(*p++);
+                        rc_list->rcl_refcalls[i].rc_slotid = ntohl(*p++);
+                }
+        }
+        status = 0;
+out:
+        return status;
+}
+/*
+ * Decode the arguments of a CB_SEQUENCE operation.
+ *
+ * Reads the session id, the five fixed words (sequence id, slot id, highest
+ * slot id, cachethis flag, number of referring call lists) and then each
+ * referring call list. On success args->csa_rclists holds csa_nrclists
+ * decoded lists (NULL when there are none); nfs4_callback_sequence() frees
+ * them. If a list fails to decode, everything allocated here is released
+ * and args is left with no lists.
+ *
+ * Returns 0 on success or an NFS4ERR_* value in network byte order.
+ */
+static unsigned decode_cb_sequence_args(struct svc_rqst *rqstp,
+                                        struct xdr_stream *xdr,
+                                        struct cb_sequenceargs *args)
+{
+        uint32_t *p;
+        int i;
+        unsigned status;
+        status = decode_sessionid(xdr, &args->csa_sessionid);
+        if (status)
+                goto out;
+        status = htonl(NFS4ERR_RESOURCE);
+        p = read_buf(xdr, 5 * sizeof(uint32_t));
+        if (unlikely(p == NULL))
+                goto out;
+        args->csa_addr = svc_addr(rqstp);
+        args->csa_sequenceid = ntohl(*p++);
+        args->csa_slotid = ntohl(*p++);
+        args->csa_highestslotid = ntohl(*p++);
+        args->csa_cachethis = ntohl(*p++);
+        args->csa_nrclists = ntohl(*p++);
+        args->csa_rclists = NULL;
+        if (args->csa_nrclists) {
+                /*
+                 * kcalloc() checks the size multiplication for overflow and
+                 * zeroes the array, so entries that are never decoded carry
+                 * a NULL rcl_refcalls and are safe to kfree() on error.
+                 */
+                args->csa_rclists = kcalloc(args->csa_nrclists,
+                                            sizeof(*args->csa_rclists),
+                                            GFP_KERNEL);
+                if (unlikely(args->csa_rclists == NULL))
+                        goto out;
+                for (i = 0; i < args->csa_nrclists; i++) {
+                        status = decode_rc_list(xdr, &args->csa_rclists[i]);
+                        if (status)
+                                goto out_free;
+                }
+        }
+        status = 0;
+        dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u slotid %u "
+                "highestslotid %u cachethis %d nrclists %u\n",
+                __func__,
+                ((u32 *)&args->csa_sessionid)[0],
+                ((u32 *)&args->csa_sessionid)[1],
+                ((u32 *)&args->csa_sessionid)[2],
+                ((u32 *)&args->csa_sessionid)[3],
+                args->csa_sequenceid, args->csa_slotid,
+                args->csa_highestslotid, args->csa_cachethis,
+                args->csa_nrclists);
+out:
+        dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+        return status;
+out_free:
+        for (i = 0; i < args->csa_nrclists; i++)
+                kfree(args->csa_rclists[i].rcl_refcalls);
+        kfree(args->csa_rclists);
+        /* Do not leave a dangling pointer behind for later users of args */
+        args->csa_rclists = NULL;
+        args->csa_nrclists = 0;
+        goto out;
+}
+#endif /* CONFIG_NFS_V4_1 */
 static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
 {
        __be32 *p;
@@ -353,31 +477,134 @@ out:
        return status;
 }
-static __be32 process_op(struct svc_rqst *rqstp,
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Append a session id to the XDR reply stream.
+ *
+ * Reserves NFS4_MAX_SESSIONID_LEN bytes and copies the id into them.
+ * Returns 0 on success, or NFS4ERR_RESOURCE (network byte order) when the
+ * space cannot be reserved.
+ */
+static unsigned encode_sessionid(struct xdr_stream *xdr,
+                                 const struct nfs4_sessionid *sid)
+{
+        int nbytes = NFS4_MAX_SESSIONID_LEN;
+        uint32_t *dst = xdr_reserve_space(xdr, nbytes);
+        if (likely(dst != NULL)) {
+                memcpy(dst, sid, nbytes);
+                return 0;
+        }
+        return htonl(NFS4ERR_RESOURCE);
+}
+/*
+ * Encode the result of a CB_SEQUENCE operation.
+ *
+ * When res->csr_status is non-zero nothing is encoded and that status is
+ * returned unchanged. Otherwise the session id is written, followed by the
+ * sequence id, slot id, highest slot id and target highest slot id.
+ *
+ * Returns 0 on success or an NFS4ERR_* value in network byte order.
+ */
+static unsigned encode_cb_sequence_res(struct svc_rqst *rqstp,
+                                       struct xdr_stream *xdr,
+                                       const struct cb_sequenceres *res)
+{
+        uint32_t *p;
+        unsigned status = res->csr_status;
+        if (unlikely(status != 0))
+                goto out;
+        /* A session id that could not be encoded must fail the whole reply */
+        status = encode_sessionid(xdr, &res->csr_sessionid);
+        if (unlikely(status != 0))
+                goto out;
+        status = htonl(NFS4ERR_RESOURCE);
+        p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t));
+        if (unlikely(p == NULL))
+                goto out;
+        *p++ = htonl(res->csr_sequenceid);
+        *p++ = htonl(res->csr_slotid);
+        *p++ = htonl(res->csr_highestslotid);
+        *p++ = htonl(res->csr_target_highestslotid);
+        status = 0;
+out:
+        dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+        return status;
+}
+/*
+ * Validate a minorversion 1 callback operation and look up its handler.
+ *
+ * CB_SEQUENCE is only accepted as the first operation of the compound
+ * (nop == 0); every other operation is only accepted after the first.
+ * Implemented operations get *op pointed at their callback_ops[] entry,
+ * the listed unimplemented ones are answered with NFS4ERR_NOTSUPP and
+ * anything else with NFS4ERR_OP_ILLEGAL.
+ */
+static __be32
+preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+        const int is_sequence = (op_nr == OP_CB_SEQUENCE);
+        if (is_sequence && nop != 0)
+                return htonl(NFS4ERR_SEQUENCE_POS);
+        if (!is_sequence && nop == 0)
+                return htonl(NFS4ERR_OP_NOT_IN_SESSION);
+        switch (op_nr) {
+        case OP_CB_GETATTR:
+        case OP_CB_RECALL:
+        case OP_CB_SEQUENCE:
+                *op = &callback_ops[op_nr];
+                return htonl(NFS_OK);
+        case OP_CB_LAYOUTRECALL:
+        case OP_CB_NOTIFY_DEVICEID:
+        case OP_CB_NOTIFY:
+        case OP_CB_PUSH_DELEG:
+        case OP_CB_RECALL_ANY:
+        case OP_CB_RECALLABLE_OBJ_AVAIL:
+        case OP_CB_RECALL_SLOT:
+        case OP_CB_WANTS_CANCELLED:
+        case OP_CB_NOTIFY_LOCK:
+                return htonl(NFS4ERR_NOTSUPP);
+        default:
+                break;
+        }
+        return htonl(NFS4ERR_OP_ILLEGAL);
+}
+#else /* CONFIG_NFS_V4_1 */
+/*
+ * Stub used when CONFIG_NFS_V4_1 is not set: every operation is rejected
+ * with NFS4ERR_MINOR_VERS_MISMATCH and *op is left untouched.
+ */
+static __be32
+preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+        return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+}
+#endif /* CONFIG_NFS_V4_1 */
+/*
+ * Validate a minorversion 0 callback operation and look up its handler.
+ *
+ * Only CB_GETATTR and CB_RECALL are accepted; for those *op is pointed at
+ * the matching callback_ops[] entry. Any other operation number yields
+ * NFS4ERR_OP_ILLEGAL and leaves *op untouched.
+ */
+static __be32
+preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
+{
+        if (op_nr != OP_CB_GETATTR && op_nr != OP_CB_RECALL)
+                return htonl(NFS4ERR_OP_ILLEGAL);
+        *op = &callback_ops[op_nr];
+        return htonl(NFS_OK);
+}
+static __be32 process_op(uint32_t minorversion, int nop,
+                struct svc_rqst *rqstp,
                struct xdr_stream *xdr_in, void *argp,
                struct xdr_stream *xdr_out, void *resp)
 {
        struct callback_op *op = &callback_ops[0];
        unsigned int op_nr = OP_CB_ILLEGAL;
-        __be32 status = 0;
+        __be32 status;
        long maxlen;
        __be32 res;
        dprintk("%s: start\n", __func__);
        status = decode_op_hdr(xdr_in, &op_nr);
-        if (likely(status == 0)) {
+        if (unlikely(status)) {
-                switch (op_nr) {
+                status = htonl(NFS4ERR_OP_ILLEGAL);
-                        case OP_CB_GETATTR:
+                goto out;
-                        case OP_CB_RECALL:
-                                op = &callback_ops[op_nr];
-                                break;
-                        default:
-                                op_nr = OP_CB_ILLEGAL;
-                                op = &callback_ops[0];
-                                status = htonl(NFS4ERR_OP_ILLEGAL);
-                }
        }
+        dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
+                __func__, minorversion, nop, op_nr);
+        status = minorversion ? preprocess_nfs41_op(nop, op_nr, &op) :
+                                preprocess_nfs4_op(op_nr, &op);
+        if (status == htonl(NFS4ERR_OP_ILLEGAL))
+                op_nr = OP_CB_ILLEGAL;
+out:
        maxlen = xdr_out->end - xdr_out->p;
        if (maxlen > 0 && maxlen < PAGE_SIZE) {
                if (likely(status == 0 && op->decode_args != NULL))
@@ -425,7 +652,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
                return rpc_system_err;
        while (status == 0 && nops != hdr_arg.nops) {
-                status = process_op(rqstp, &xdr_in, argp, &xdr_out, resp);
+                status = process_op(hdr_arg.minorversion, nops,
+                                    rqstp, &xdr_in, argp, &xdr_out, resp);
                nops++;
        }
@@ -452,7 +680,15 @@ static struct callback_op callback_ops[] = {
                .process_op = (callback_process_op_t)nfs4_callback_recall,
                .decode_args = (callback_decode_arg_t)decode_recall_args,
                .res_maxsize = CB_OP_RECALL_RES_MAXSZ,
-        }
+        },
+#if defined(CONFIG_NFS_V4_1)
+        [OP_CB_SEQUENCE] = {
+                .process_op = (callback_process_op_t)nfs4_callback_sequence,
+                .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
+                .encode_res = (callback_encode_res_t)encode_cb_sequence_res,
+                .res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ,
+        },
+#endif /* CONFIG_NFS_V4_1 */
 };
 /*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 75c9cd2aa119..8d25ccb2d51d 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -37,6 +37,7 @@
 #include <linux/in6.h>
 #include <net/ipv6.h>
 #include <linux/nfs_xdr.h>
+#include <linux/sunrpc/bc_xprt.h>
 #include <asm/system.h>
@@ -102,6 +103,7 @@ struct nfs_client_initdata {
        size_t addrlen;
        const struct nfs_rpc_ops *rpc_ops;
        int proto;
+        u32 minorversion;
 };
 /*
@@ -114,18 +116,13 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 {
        struct nfs_client *clp;
        struct rpc_cred *cred;
+        int err = -ENOMEM;
        if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
                goto error_0;
        clp->rpc_ops = cl_init->rpc_ops;
-        if (cl_init->rpc_ops->version == 4) {
-                if (nfs_callback_up() < 0)
-                        goto error_2;
-                __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
-        }
        atomic_set(&clp->cl_count, 1);
        clp->cl_cons_state = NFS_CS_INITING;
@@ -133,9 +130,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
        clp->cl_addrlen = cl_init->addrlen;
        if (cl_init->hostname) {
+                err = -ENOMEM;
                clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL);
                if (!clp->cl_hostname)
-                        goto error_3;
+                        goto error_cleanup;
        }
        INIT_LIST_HEAD(&clp->cl_superblocks);
@@ -150,6 +148,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
        rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
        clp->cl_boot_time = CURRENT_TIME;
        clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
+        clp->cl_minorversion = cl_init->minorversion;
 #endif
        cred = rpc_lookup_machine_cred();
        if (!IS_ERR(cred))
@@ -159,13 +158,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
        return clp;
-error_3:
+error_cleanup:
-        if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
-                nfs_callback_down();
-error_2:
        kfree(clp);
 error_0:
-        return NULL;
+        return ERR_PTR(err);
 }
 static void nfs4_shutdown_client(struct nfs_client *clp)
@@ -182,12 +178,42 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
 }
 /*
+ * Destroy the NFS4 callback service
+ */
+static void nfs4_destroy_callback(struct nfs_client *clp)
+{
+#ifdef CONFIG_NFS_V4
+        /* Nothing to undo unless this client has NFS_CS_CALLBACK set */
+        if (!__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
+                return;
+        nfs_callback_down(clp->cl_minorversion);
+#endif /* CONFIG_NFS_V4 */
+}
+/*
+ * Clears/puts all minor version specific parts from an nfs_client struct
+ * reverting it to minorversion 0.
+ */
+static void nfs4_clear_client_minor_version(struct nfs_client *clp)
+{
+#ifdef CONFIG_NFS_V4_1
+        /* Destroy the session, if there is one, then drop the pointer to it */
+        if (nfs4_has_session(clp)) {
+                nfs4_destroy_session(clp->cl_session);
+                clp->cl_session = NULL;
+        }
+        /* Restore the cl_call_sync handler that minorversion 0 uses */
+        clp->cl_call_sync = _nfs4_call_sync;
+#endif /* CONFIG_NFS_V4_1 */
+        /* Takes the callback service down if NFS_CS_CALLBACK is set */
+        nfs4_destroy_callback(clp);
+}
+/*
 * Destroy a shared client record
 */
 static void nfs_free_client(struct nfs_client *clp)
 {
        dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
+        nfs4_clear_client_minor_version(clp);
        nfs4_shutdown_client(clp);
        nfs_fscache_release_client_cookie(clp);
@@ -196,9 +222,6 @@ static void nfs_free_client(struct nfs_client *clp)
        if (!IS_ERR(clp->cl_rpcclient))
                rpc_shutdown_client(clp->cl_rpcclient);
-        if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
-                nfs_callback_down();
        if (clp->cl_machine_cred != NULL)
                put_rpccred(clp->cl_machine_cred);
@@ -347,7 +370,8 @@ struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
                struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
                /* Don't match clients that failed to initialise properly */
-                if (clp->cl_cons_state != NFS_CS_READY)
+                if (!(clp->cl_cons_state == NFS_CS_READY ||
+                      clp->cl_cons_state == NFS_CS_SESSION_INITING))
                        continue;
                /* Different NFS versions cannot share the same nfs_client */
@@ -420,7 +444,9 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
                if (clp->cl_proto != data->proto)
                        continue;
+                /* Match nfsv4 minorversion */
+                if (clp->cl_minorversion != data->minorversion)
+                        continue;
                /* Match the full socket address */
                if (!nfs_sockaddr_cmp(sap, clap))
                        continue;
@@ -456,9 +482,10 @@ static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_in
                spin_unlock(&nfs_client_lock);
                new = nfs_alloc_client(cl_init);
-        } while (new);
+        } while (!IS_ERR(new));
-        return ERR_PTR(-ENOMEM);
+        dprintk("--> nfs_get_client() = %ld [failed]\n", PTR_ERR(new));
+        return new;
        /* install a new client and return with it unready */
 install_client:
@@ -478,7 +505,7 @@ found_client:
                nfs_free_client(new);
        error = wait_event_killable(nfs_client_active_wq,
-                                clp->cl_cons_state != NFS_CS_INITING);
+                                clp->cl_cons_state < NFS_CS_INITING);
        if (error < 0) {
                nfs_put_client(clp);
                return ERR_PTR(-ERESTARTSYS);
@@ -499,13 +526,29 @@ found_client:
 /*
 * Mark a server as ready or failed
 */
-static void nfs_mark_client_ready(struct nfs_client *clp, int state)
+void nfs_mark_client_ready(struct nfs_client *clp, int state)
 {
        clp->cl_cons_state = state;
        wake_up_all(&nfs_client_active_wq);
 }
 /*
+ * With sessions, the client is not marked ready until after a
+ * successful EXCHANGE_ID and CREATE_SESSION.
+ *
+ * Map cl_cons_state errors to EPROTONOSUPPORT to indicate
+ * other versions of NFS can be tried.
+ */
+int nfs4_check_client_ready(struct nfs_client *clp)
+{
+        int ret = 0;
+        /* Only session-based clients can be in a not-yet-ready state here */
+        if (nfs4_has_session(clp) && clp->cl_cons_state < NFS_CS_READY)
+                ret = -EPROTONOSUPPORT;
+        return ret;
+}
+/*
 * Initialise the timeout values for a connection
 */
 static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
@@ -1050,6 +1093,61 @@ error:
 #ifdef CONFIG_NFS_V4
 /*
+ * Initialize the NFS4 callback service
+ */
+static int nfs4_init_callback(struct nfs_client *clp)
+{
+        int error;
+        /* Only NFSv4 clients get a callback service */
+        if (clp->rpc_ops->version != 4)
+                return 0;
+        /* Session-based clients need the backchannel set up first */
+        if (nfs4_has_session(clp)) {
+                error = xprt_setup_backchannel(clp->cl_rpcclient->cl_xprt,
+                                               NFS41_BC_MIN_CALLBACKS);
+                if (error < 0)
+                        return error;
+        }
+        error = nfs_callback_up(clp->cl_minorversion,
+                                clp->cl_rpcclient->cl_xprt);
+        if (error < 0) {
+                dprintk("%s: failed to start callback. Error = %d\n",
+                        __func__, error);
+                return error;
+        }
+        /* Remember that nfs4_destroy_callback() has work to do */
+        __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
+        return 0;
+}
+/*
+ * Initialize the minor version specific parts of an NFS4 client record
+ */
+static int nfs4_init_client_minor_version(struct nfs_client *clp)
+{
+        /* Default call path; replaced below for session-based clients */
+        clp->cl_call_sync = _nfs4_call_sync;
+#if defined(CONFIG_NFS_V4_1)
+        if (clp->cl_minorversion != 0) {
+                /*
+                 * Create the session and mark it expired.
+                 * When a SEQUENCE operation encounters the expired session
+                 * it will do session recovery to initialize it.
+                 */
+                struct nfs4_session *new_session = nfs4_alloc_session(clp);
+                if (new_session == NULL)
+                        return -ENOMEM;
+                clp->cl_call_sync = _nfs4_call_sync_session;
+                clp->cl_session = new_session;
+        }
+#endif /* CONFIG_NFS_V4_1 */
+        return nfs4_init_callback(clp);
+}
+/*
 * Initialise an NFS4 client record
 */
 static int nfs4_init_client(struct nfs_client *clp,
@@ -1083,7 +1181,12 @@ static int nfs4_init_client(struct nfs_client *clp,
        }
        __set_bit(NFS_CS_IDMAP, &clp->cl_res_state);
-        nfs_mark_client_ready(clp, NFS_CS_READY);
+        error = nfs4_init_client_minor_version(clp);
+        if (error < 0)
+                goto error;
+        if (!nfs4_has_session(clp))
+                nfs_mark_client_ready(clp, NFS_CS_READY);
        return 0;
 error:
@@ -1101,7 +1204,8 @@ static int nfs4_set_client(struct nfs_server *server,
                const size_t addrlen,
                const char *ip_addr,
                rpc_authflavor_t authflavour,
-                int proto, const struct rpc_timeout *timeparms)
+                int proto, const struct rpc_timeout *timeparms,
+                u32 minorversion)
 {
        struct nfs_client_initdata cl_init = {
                .hostname = hostname,
@@ -1109,6 +1213,7 @@ static int nfs4_set_client(struct nfs_server *server,
                .addrlen = addrlen,
                .rpc_ops = &nfs_v4_clientops,
                .proto = proto,
+                .minorversion = minorversion,
        };
        struct nfs_client *clp;
        int error;
@@ -1137,6 +1242,22 @@ error:
        return error;
 }
+/*
+ * Session has been established, and the client marked ready.
+ * Set the mount rsize and wsize with negotiated fore channel
+ * attributes which will be bound checked in nfs_server_set_fsinfo.
+ */
+static void nfs4_session_set_rwsize(struct nfs_server *server)
+{
+#ifdef CONFIG_NFS_V4_1
+        struct nfs_client *clp = server->nfs_client;
+        /* Without a session there are no negotiated sizes to apply */
+        if (nfs4_has_session(clp)) {
+                server->rsize = clp->cl_session->fc_attrs.max_resp_sz;
+                server->wsize = clp->cl_session->fc_attrs.max_rqst_sz;
+        }
+#endif /* CONFIG_NFS_V4_1 */
+}
 /*
 * Create a version 4 volume record
 */
@@ -1164,7 +1285,8 @@ static int nfs4_init_server(struct nfs_server *server,
                        data->client_address,
                        data->auth_flavors[0],
                        data->nfs_server.protocol,
-                        &timeparms);
+                        &timeparms,
+                        data->minorversion);
        if (error < 0)
                goto error;
@@ -1214,6 +1336,10 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
        BUG_ON(!server->nfs_client->rpc_ops);
        BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
+        error = nfs4_init_session(server);
+        if (error < 0)
+                goto error;
        /* Probe the root fh to retrieve its FSID */
        error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path);
        if (error < 0)
@@ -1224,6 +1350,8 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
                (unsigned long long) server->fsid.minor);
        dprintk("Mount FH: %d\n", mntfh->size);
+        nfs4_session_set_rwsize(server);
        error = nfs_probe_fsinfo(server, mntfh, &fattr);
        if (error < 0)
                goto error;
@@ -1282,7 +1410,8 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
                                parent_client->cl_ipaddr,
                                data->authflavor,
                                parent_server->client->cl_xprt->prot,
-                                parent_server->client->cl_timeout);
+                                parent_server->client->cl_timeout,
+                                parent_client->cl_minorversion);
        if (error < 0)
                goto error;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 968225a88015..6dd48a4405b4 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -10,6 +10,7 @@
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 #include <linux/nfs4.h>
@@ -68,29 +69,26 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
 {
        struct inode *inode = state->inode;
        struct file_lock *fl;
-        int status;
+        int status = 0;
+        if (inode->i_flock == NULL)
+                goto out;
+        /* Protect inode->i_flock using the BKL */
+        lock_kernel();
        for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
                if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
                        continue;
                if (nfs_file_open_context(fl->fl_file) != ctx)
                        continue;
+                unlock_kernel();
                status = nfs4_lock_delegation_recall(state, fl);
-                if (status >= 0)
+                if (status < 0)
-                        continue;
+                        goto out;
-                switch (status) {
+                lock_kernel();
-                        default:
-                                printk(KERN_ERR "%s: unhandled error %d.\n",
-                                                __func__, status);
-                        case -NFS4ERR_EXPIRED:
-                                /* kill_proc(fl->fl_pid, SIGLOST, 1); */
-                        case -NFS4ERR_STALE_CLIENTID:
-                                nfs4_schedule_state_recovery(NFS_SERVER(inode)->nfs_client);
-                                goto out_err;
-                }
        }
-        return 0;
+        unlock_kernel();
-out_err:
+out:
        return status;
 }
@@ -268,7 +266,10 @@ static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegat
        struct nfs_inode *nfsi = NFS_I(inode);
        nfs_msync_inode(inode);
-        /* Guard against new delegated open calls */
+        /*
+         * Guard against new delegated open/lock/unlock calls and against
+         * state recovery
+         */
        down_write(&nfsi->rwsem);
        nfs_delegation_claim_opens(inode, &delegation->stateid);
        up_write(&nfsi->rwsem);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 89f98e9a024b..32062c33c859 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -29,7 +29,6 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include <linux/pagevec.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
@@ -1026,12 +1025,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
                                res = NULL;
                                goto out;
                        /* This turned out not to be a regular file */
-                        case -EISDIR:
                        case -ENOTDIR:
                                goto no_open;
                        case -ELOOP:
                                if (!(nd->intent.open.flags & O_NOFOLLOW))
                                        goto no_open;
+                        /* case -EISDIR: */
                        /* case -EINVAL: */
                        default:
                                goto out;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 08f6b040d289..e4e089a8f294 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -255,10 +255,13 @@ static void nfs_direct_read_release(void *calldata)
        if (put_dreq(dreq))
                nfs_direct_complete(dreq);
-        nfs_readdata_release(calldata);
+        nfs_readdata_free(data);
 }
 static const struct rpc_call_ops nfs_read_direct_ops = {
+#if defined(CONFIG_NFS_V4_1)
+        .rpc_call_prepare = nfs_read_prepare,
+#endif /* CONFIG_NFS_V4_1 */
        .rpc_call_done = nfs_direct_read_result,
        .rpc_release = nfs_direct_read_release,
 };
@@ -311,14 +314,14 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
                                        data->npages, 1, 0, data->pagevec, NULL);
                up_read(&current->mm->mmap_sem);
                if (result < 0) {
-                        nfs_readdata_release(data);
+                        nfs_readdata_free(data);
                        break;
                }
                if ((unsigned)result < data->npages) {
                        bytes = result * PAGE_SIZE;
                        if (bytes <= pgbase) {
                                nfs_direct_release_pages(data->pagevec, result);
-                                nfs_readdata_release(data);
+                                nfs_readdata_free(data);
                                break;
                        }
                        bytes -= pgbase;
@@ -331,7 +334,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
                data->inode = inode;
                data->cred = msg.rpc_cred;
                data->args.fh = NFS_FH(inode);
-                data->args.context = get_nfs_open_context(ctx);
+                data->args.context = ctx;
                data->args.offset = pos;
                data->args.pgbase = pgbase;
                data->args.pages = data->pagevec;
@@ -438,7 +441,7 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
                struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages);
                list_del(&data->pages);
                nfs_direct_release_pages(data->pagevec, data->npages);
-                nfs_writedata_release(data);
+                nfs_writedata_free(data);
        }
 }
@@ -531,10 +534,13 @@ static void nfs_direct_commit_release(void *calldata)
        dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
        nfs_direct_write_complete(dreq, data->inode);
-        nfs_commitdata_release(calldata);
+        nfs_commit_free(data);
 }
 static const struct rpc_call_ops nfs_commit_direct_ops = {
+#if defined(CONFIG_NFS_V4_1)
+        .rpc_call_prepare = nfs_write_prepare,
+#endif /* CONFIG_NFS_V4_1 */
        .rpc_call_done = nfs_direct_commit_result,
        .rpc_release = nfs_direct_commit_release,
 };
@@ -564,7 +570,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
        data->args.fh = NFS_FH(data->inode);
        data->args.offset = 0;
        data->args.count = 0;
-        data->args.context = get_nfs_open_context(dreq->ctx);
+        data->args.context = dreq->ctx;
        data->res.count = 0;
        data->res.fattr = &data->fattr;
        data->res.verf = &data->verf;
@@ -673,6 +679,9 @@ out_unlock:
 }
 static const struct rpc_call_ops nfs_write_direct_ops = {
+#if defined(CONFIG_NFS_V4_1)
+        .rpc_call_prepare = nfs_write_prepare,
+#endif /* CONFIG_NFS_V4_1 */
        .rpc_call_done = nfs_direct_write_result,
        .rpc_release = nfs_direct_write_release,
 };
@@ -725,14 +734,14 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
                                        data->npages, 0, 0, data->pagevec, NULL);
                up_read(&current->mm->mmap_sem);
                if (result < 0) {
-                        nfs_writedata_release(data);
+                        nfs_writedata_free(data);
                        break;
                }
                if ((unsigned)result < data->npages) {
                        bytes = result * PAGE_SIZE;
                        if (bytes <= pgbase) {
                                nfs_direct_release_pages(data->pagevec, result);
-                                nfs_writedata_release(data);
+                                nfs_writedata_free(data);
                                break;
                        }
                        bytes -= pgbase;
@@ -747,7 +756,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
                data->inode = inode;
                data->cred = msg.rpc_cred;
                data->args.fh = NFS_FH(inode);
-                data->args.context = get_nfs_open_context(ctx);
+                data->args.context = ctx;
                data->args.offset = pos;
                data->args.pgbase = pgbase;
                data->args.pages = data->pagevec;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index ec7e27d00bc6..05062329b678 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -26,7 +26,6 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include <linux/aio.h>
 #include <asm/uaccess.h>
@@ -48,6 +47,9 @@ static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos,
                                        size_t count, unsigned int flags);
 static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
                                unsigned long nr_segs, loff_t pos);
+static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
+                                        struct file *filp, loff_t *ppos,
+                                        size_t count, unsigned int flags);
 static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
                                unsigned long nr_segs, loff_t pos);
 static int  nfs_file_flush(struct file *, fl_owner_t id);
@@ -73,6 +75,7 @@ const struct file_operations nfs_file_operations = {
        .lock           = nfs_lock,
        .flock          = nfs_flock,
        .splice_read    = nfs_file_splice_read,
+        .splice_write   = nfs_file_splice_write,
        .check_flags    = nfs_check_flags,
        .setlease       = nfs_setlease,
 };
@@ -587,12 +590,38 @@ out_swapfile:
        goto out;
 }
+static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
+                                     struct file *filp, loff_t *ppos,
+                                     size_t count, unsigned int flags)
+{
+        struct dentry *dentry = filp->f_path.dentry;
+        struct inode *inode = dentry->d_inode;
+        ssize_t ret;
+        dprintk("NFS splice_write(%s/%s, %lu@%llu)\n",
+                dentry->d_parent->d_name.name, dentry->d_name.name,
+                (unsigned long) count, (unsigned long long) *ppos);
+        /*
+         * The combination of splice and an O_APPEND destination is disallowed.
+         */
+        nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
+        ret = generic_file_splice_write(pipe, filp, ppos, count, flags);
+        if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
+                int err = nfs_do_fsync(nfs_file_open_context(filp), inode);
+                if (err < 0)
+                        ret = err;
+        }
+        return ret;
+}
 static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
 {
        struct inode *inode = filp->f_mapping->host;
        int status = 0;
-        lock_kernel();
        /* Try local locking first */
        posix_test_lock(filp, fl);
        if (fl->fl_type != F_UNLCK) {
@@ -608,7 +637,6 @@ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
        status = NFS_PROTO(inode)->lock(filp, cmd, fl);
 out:
-        unlock_kernel();
        return status;
 out_noconflict:
        fl->fl_type = F_UNLCK;
@@ -650,13 +678,11 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
         *      If we're signalled while cleaning up locks on process exit, we
         *      still need to complete the unlock.
         */
-        lock_kernel();
        /* Use local locking if mounted with "-onolock" */
        if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
                status = NFS_PROTO(inode)->lock(filp, cmd, fl);
        else
                status = do_vfs_lock(filp, fl);
-        unlock_kernel();
        return status;
 }
@@ -673,13 +699,11 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
        if (status != 0)
                goto out;
-        lock_kernel();
        /* Use local locking if mounted with "-onolock" */
        if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
                status = NFS_PROTO(inode)->lock(filp, cmd, fl);
        else
                status = do_vfs_lock(filp, fl);
-        unlock_kernel();
        if (status < 0)
                goto out;
        /*
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 46177cb87064..b35d2a616066 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -30,7 +30,6 @@
 #include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
 #include <linux/namei.h>
-#include <linux/mnt_namespace.h>
 #include <linux/security.h>
 #include <asm/system.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 64f87194d390..bd7938eda6a8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -30,7 +30,6 @@
 #include <linux/nfs_mount.h>
 #include <linux/nfs4_mount.h>
 #include <linux/lockd/bind.h>
-#include <linux/smp_lock.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include <linux/nfs_idmap.h>
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e4d6a8348adf..7dd90a6769d0 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -2,6 +2,7 @@
 * NFS internal definitions
 */
+#include "nfs4_fs.h"
 #include <linux/mount.h>
 #include <linux/security.h>
@@ -17,6 +18,18 @@ struct nfs_string;
 */
 #define NFS_MAX_READAHEAD       (RPC_DEF_SLOT_TABLE - 1)
+/*
+ * Determine if sessions are in use.
+ */
+static inline int nfs4_has_session(const struct nfs_client *clp)
+{
+#ifdef CONFIG_NFS_V4_1
+        if (clp->cl_session)
+                return 1;
+#endif /* CONFIG_NFS_V4_1 */
+        return 0;
+}
 struct nfs_clone_mount {
        const struct super_block *sb;
        const struct dentry *dentry;
@@ -30,6 +43,12 @@ struct nfs_clone_mount {
 };
 /*
+ * Note: RFC 1813 doesn't limit the number of auth flavors that
+ * a server can return, so make something up.
+ */
+#define NFS_MAX_SECFLAVORS      (12)
+/*
 * In-kernel mount arguments
 */
 struct nfs_parsed_mount_data {
@@ -44,6 +63,7 @@ struct nfs_parsed_mount_data {
        unsigned int            auth_flavor_len;
        rpc_authflavor_t        auth_flavors[1];
        char                    *client_address;
+        unsigned int            minorversion;
        char                    *fscache_uniq;
        struct {
@@ -77,6 +97,8 @@ struct nfs_mount_request {
        unsigned short          protocol;
        struct nfs_fh           *fh;
        int                     noresvport;
+        unsigned int            *auth_flav_len;
+        rpc_authflavor_t        *auth_flavs;
 };
 extern int nfs_mount(struct nfs_mount_request *info);
@@ -99,6 +121,8 @@ extern void nfs_free_server(struct nfs_server *server);
 extern struct nfs_server *nfs_clone_server(struct nfs_server *,
                                           struct nfs_fh *,
                                           struct nfs_fattr *);
+extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
+extern int nfs4_check_client_ready(struct nfs_client *clp);
 #ifdef CONFIG_PROC_FS
 extern int __init nfs_fs_proc_init(void);
 extern void nfs_fs_proc_exit(void);
@@ -146,6 +170,20 @@ extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int);
 extern struct rpc_procinfo nfs3_procedures[];
 extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int);
+/* nfs4proc.c */
+static inline void nfs4_restart_rpc(struct rpc_task *task,
+                                    const struct nfs_client *clp)
+{
+#ifdef CONFIG_NFS_V4_1
+        if (nfs4_has_session(clp) &&
+            test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) {
+                rpc_restart_call_prepare(task);
+                return;
+        }
+#endif /* CONFIG_NFS_V4_1 */
+        rpc_restart_call(task);
+}
 /* nfs4xdr.c */
 #ifdef CONFIG_NFS_V4
 extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
@@ -205,6 +243,38 @@ extern int nfs4_path_walk(struct nfs_server *server,
                          const char *path);
 #endif
+/* read.c */
+extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
+/* write.c */
+extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
+/* nfs4proc.c */
+extern int _nfs4_call_sync(struct nfs_server *server,
+                           struct rpc_message *msg,
+                           struct nfs4_sequence_args *args,
+                           struct nfs4_sequence_res *res,
+                           int cache_reply);
+extern int _nfs4_call_sync_session(struct nfs_server *server,
+                                   struct rpc_message *msg,
+                                   struct nfs4_sequence_args *args,
+                                   struct nfs4_sequence_res *res,
+                                   int cache_reply);
+#ifdef CONFIG_NFS_V4_1
+extern void nfs41_sequence_free_slot(const struct nfs_client *,
+                                     struct nfs4_sequence_res *res);
+#endif /* CONFIG_NFS_V4_1 */
+static inline void nfs4_sequence_free_slot(const struct nfs_client *clp,
+                                           struct nfs4_sequence_res *res)
+{
+#ifdef CONFIG_NFS_V4_1
+        if (nfs4_has_session(clp))
+                nfs41_sequence_free_slot(clp, res);
+#endif /* CONFIG_NFS_V4_1 */
+}
 /*
 * Determine the device name as a string
 */
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index a2ab2529b5ca..ceda50aad73c 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -31,7 +31,7 @@ static inline void nfs_inc_server_stats(const struct nfs_server *server,
        cpu = get_cpu();
        iostats = per_cpu_ptr(server->io_stats, cpu);
        iostats->events[stat]++;
-        put_cpu_no_resched();
+        put_cpu();
 }
 static inline void nfs_inc_stats(const struct inode *inode,
@@ -50,7 +50,7 @@ static inline void nfs_add_server_stats(const struct nfs_server *server,
        cpu = get_cpu();
        iostats = per_cpu_ptr(server->io_stats, cpu);
        iostats->bytes[stat] += addend;
-        put_cpu_no_resched();
+        put_cpu();
 }
 static inline void nfs_add_stats(const struct inode *inode,
@@ -71,7 +71,7 @@ static inline void nfs_add_fscache_stats(struct inode *inode,
        cpu = get_cpu();
        iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
        iostats->fscache[stat] += addend;
-        put_cpu_no_resched();
+        put_cpu();
 }
 #endif
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index ca905a5bb1ba..38ef9eaec407 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -20,8 +20,116 @@
 # define NFSDBG_FACILITY        NFSDBG_MOUNT
 #endif
+/*
+ * Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4
+ */
+#define MNTPATHLEN              (1024)
+/*
+ * XDR data type sizes
+ */
+#define encode_dirpath_sz       (1 + XDR_QUADLEN(MNTPATHLEN))
+#define MNT_status_sz           (1)
+#define MNT_fhs_status_sz       (1)
+#define MNT_fhandle_sz          XDR_QUADLEN(NFS2_FHSIZE)
+#define MNT_fhandle3_sz         (1 + XDR_QUADLEN(NFS3_FHSIZE))
+#define MNT_authflav3_sz        (1 + NFS_MAX_SECFLAVORS)
+/*
+ * XDR argument and result sizes
+ */
+#define MNT_enc_dirpath_sz      encode_dirpath_sz
+#define MNT_dec_mountres_sz     (MNT_status_sz + MNT_fhandle_sz)
+#define MNT_dec_mountres3_sz    (MNT_status_sz + MNT_fhandle3_sz + \
+                                 MNT_authflav3_sz)
+/*
+ * Defined by RFC 1094, section A.5
+ */
+enum {
+        MOUNTPROC_NULL          = 0,
+        MOUNTPROC_MNT           = 1,
+        MOUNTPROC_DUMP          = 2,
+        MOUNTPROC_UMNT          = 3,
+        MOUNTPROC_UMNTALL       = 4,
+        MOUNTPROC_EXPORT        = 5,
+};
+/*
+ * Defined by RFC 1813, section 5.2
+ */
+enum {
+        MOUNTPROC3_NULL         = 0,
+        MOUNTPROC3_MNT          = 1,
+        MOUNTPROC3_DUMP         = 2,
+        MOUNTPROC3_UMNT         = 3,
+        MOUNTPROC3_UMNTALL      = 4,
+        MOUNTPROC3_EXPORT       = 5,
+};
 static struct rpc_program       mnt_program;
+/*
+ * Defined by OpenGroup XNFS Version 3W, chapter 8
+ */
+enum mountstat {
+        MNT_OK                  = 0,
+        MNT_EPERM               = 1,
+        MNT_ENOENT              = 2,
+        MNT_EACCES              = 13,
+        MNT_EINVAL              = 22,
+};
+static struct {
+        u32 status;
+        int errno;
+} mnt_errtbl[] = {
+        { .status = MNT_OK,                     .errno = 0,             },
+        { .status = MNT_EPERM,                  .errno = -EPERM,        },
+        { .status = MNT_ENOENT,                 .errno = -ENOENT,       },
+        { .status = MNT_EACCES,                 .errno = -EACCES,       },
+        { .status = MNT_EINVAL,                 .errno = -EINVAL,       },
+};
+/*
+ * Defined by RFC 1813, section 5.1.5
+ */
+enum mountstat3 {
+        MNT3_OK                 = 0,            /* no error */
+        MNT3ERR_PERM            = 1,            /* Not owner */
+        MNT3ERR_NOENT           = 2,            /* No such file or directory */
+        MNT3ERR_IO              = 5,            /* I/O error */
+        MNT3ERR_ACCES           = 13,           /* Permission denied */
+        MNT3ERR_NOTDIR          = 20,           /* Not a directory */
+        MNT3ERR_INVAL           = 22,           /* Invalid argument */
+        MNT3ERR_NAMETOOLONG     = 63,           /* Filename too long */
+        MNT3ERR_NOTSUPP         = 10004,        /* Operation not supported */
+        MNT3ERR_SERVERFAULT     = 10006,        /* A failure on the server */
+};
+static struct {
+        u32 status;
+        int errno;
+} mnt3_errtbl[] = {
+        { .status = MNT3_OK,                    .errno = 0,             },
+        { .status = MNT3ERR_PERM,               .errno = -EPERM,        },
+        { .status = MNT3ERR_NOENT,              .errno = -ENOENT,       },
+        { .status = MNT3ERR_IO,                 .errno = -EIO,          },
+        { .status = MNT3ERR_ACCES,              .errno = -EACCES,       },
+        { .status = MNT3ERR_NOTDIR,             .errno = -ENOTDIR,      },
+        { .status = MNT3ERR_INVAL,              .errno = -EINVAL,       },
+        { .status = MNT3ERR_NAMETOOLONG,        .errno = -ENAMETOOLONG, },
+        { .status = MNT3ERR_NOTSUPP,            .errno = -ENOTSUPP,     },
+        { .status = MNT3ERR_SERVERFAULT,        .errno = -ESERVERFAULT, },
+};
+struct mountres {
+        int errno;
+        struct nfs_fh *fh;
+        unsigned int *auth_count;
+        rpc_authflavor_t *auth_flavors;
+};
 struct mnt_fhstatus {
        u32 status;
        struct nfs_fh *fh;
@@ -35,8 +143,10 @@ struct mnt_fhstatus {
 */
 int nfs_mount(struct nfs_mount_request *info)
 {
-        struct mnt_fhstatus     result = {
+        struct mountres result = {
-                .fh             = info->fh
+                .fh             = info->fh,
+                .auth_count     = info->auth_flav_len,
+                .auth_flavors   = info->auth_flavs,
        };
        struct rpc_message msg  = {
                .rpc_argp       = info->dirpath,
@@ -68,14 +178,14 @@ int nfs_mount(struct nfs_mount_request *info)
        if (info->version == NFS_MNT3_VERSION)
                msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT];
        else
-                msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT];
+                msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT];
        status = rpc_call_sync(mnt_clnt, &msg, 0);
        rpc_shutdown_client(mnt_clnt);
        if (status < 0)
                goto out_call_err;
-        if (result.status != 0)
+        if (result.errno != 0)
                goto out_mnt_err;
        dprintk("NFS: MNT request succeeded\n");
@@ -86,72 +196,215 @@ out:
 out_clnt_err:
        status = PTR_ERR(mnt_clnt);
-        dprintk("NFS: failed to create RPC client, status=%d\n", status);
+        dprintk("NFS: failed to create MNT RPC client, status=%d\n", status);
        goto out;
 out_call_err:
-        dprintk("NFS: failed to start MNT request, status=%d\n", status);
+        dprintk("NFS: MNT request failed, status=%d\n", status);
        goto out;
 out_mnt_err:
-        dprintk("NFS: MNT server returned result %d\n", result.status);
+        dprintk("NFS: MNT server returned result %d\n", result.errno);
-        status = nfs_stat_to_errno(result.status);
+        status = result.errno;
        goto out;
 }
 /*
 * XDR encode/decode functions for MOUNT
 */
-static int xdr_encode_dirpath(struct rpc_rqst *req, __be32 *p,
-                              const char *path)
+static int encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
+{
+        const u32 pathname_len = strlen(pathname);
+        __be32 *p;
+        if (unlikely(pathname_len > MNTPATHLEN))
+                return -EIO;
+        p = xdr_reserve_space(xdr, sizeof(u32) + pathname_len);
+        if (unlikely(p == NULL))
+                return -EIO;
+        xdr_encode_opaque(p, pathname, pathname_len);
+        return 0;
+}
+static int mnt_enc_dirpath(struct rpc_rqst *req, __be32 *p,
+                           const char *dirpath)
+{
+        struct xdr_stream xdr;
+        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        return encode_mntdirpath(&xdr, dirpath);
+}
+/*
+ * RFC 1094: "A non-zero status indicates some sort of error.  In this
+ * case, the status is a UNIX error number."  This can be problematic
+ * if the server and client use different errno values for the same
+ * error.
+ *
+ * However, the OpenGroup XNFS spec provides a simple mapping that is
+ * independent of local errno values on the server and the client.
+ */
+static int decode_status(struct xdr_stream *xdr, struct mountres *res)
 {
-        p = xdr_encode_string(p, path);
+        unsigned int i;
+        u32 status;
+        __be32 *p;
+        p = xdr_inline_decode(xdr, sizeof(status));
+        if (unlikely(p == NULL))
+                return -EIO;
+        status = ntohl(*p);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+        for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
+                if (mnt_errtbl[i].status == status) {
+                        res->errno = mnt_errtbl[i].errno;
+                        return 0;
+                }
+        }
+        dprintk("NFS: unrecognized MNT status code: %u\n", status);
+        res->errno = -EACCES;
        return 0;
 }
-static int xdr_decode_fhstatus(struct rpc_rqst *req, __be32 *p,
+static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res)
-                               struct mnt_fhstatus *res)
 {
        struct nfs_fh *fh = res->fh;
+        __be32 *p;
+        p = xdr_inline_decode(xdr, NFS2_FHSIZE);
+        if (unlikely(p == NULL))
+                return -EIO;
+        fh->size = NFS2_FHSIZE;
+        memcpy(fh->data, p, NFS2_FHSIZE);
+        return 0;
+}
+static int mnt_dec_mountres(struct rpc_rqst *req, __be32 *p,
+                            struct mountres *res)
+{
+        struct xdr_stream xdr;
+        int status;
+        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_status(&xdr, res);
+        if (unlikely(status != 0 || res->errno != 0))
+                return status;
+        return decode_fhandle(&xdr, res);
+}
+static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
+{
+        unsigned int i;
+        u32 status;
+        __be32 *p;
-        if ((res->status = ntohl(*p++)) == 0) {
+        p = xdr_inline_decode(xdr, sizeof(status));
-                fh->size = NFS2_FHSIZE;
+        if (unlikely(p == NULL))
-                memcpy(fh->data, p, NFS2_FHSIZE);
+                return -EIO;
+        status = ntohl(*p);
+        for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
+                if (mnt3_errtbl[i].status == status) {
+                        res->errno = mnt3_errtbl[i].errno;
+                        return 0;
+                }
        }
+        dprintk("NFS: unrecognized MNT3 status code: %u\n", status);
+        res->errno = -EACCES;
        return 0;
 }
-static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p,
+static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res)
-                                struct mnt_fhstatus *res)
 {
        struct nfs_fh *fh = res->fh;
-        unsigned size;
+        u32 size;
+        __be32 *p;
-        if ((res->status = ntohl(*p++)) == 0) {
-                size = ntohl(*p++);
+        p = xdr_inline_decode(xdr, sizeof(size));
-                if (size <= NFS3_FHSIZE && size != 0) {
+        if (unlikely(p == NULL))
-                        fh->size = size;
+                return -EIO;
-                        memcpy(fh->data, p, size);
-                } else
+        size = ntohl(*p++);
-                        res->status = -EBADHANDLE;
+        if (size > NFS3_FHSIZE || size == 0)
+                return -EIO;
+        p = xdr_inline_decode(xdr, size);
+        if (unlikely(p == NULL))
+                return -EIO;
+        fh->size = size;
+        memcpy(fh->data, p, size);
+        return 0;
+}
+static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
+{
+        rpc_authflavor_t *flavors = res->auth_flavors;
+        unsigned int *count = res->auth_count;
+        u32 entries, i;
+        __be32 *p;
+        if (*count == 0)
+                return 0;
+        p = xdr_inline_decode(xdr, sizeof(entries));
+        if (unlikely(p == NULL))
+                return -EIO;
+        entries = ntohl(*p);
+        dprintk("NFS: received %u auth flavors\n", entries);
+        if (entries > NFS_MAX_SECFLAVORS)
+                entries = NFS_MAX_SECFLAVORS;
+        p = xdr_inline_decode(xdr, sizeof(u32) * entries);
+        if (unlikely(p == NULL))
+                return -EIO;
+        if (entries > *count)
+                entries = *count;
+        for (i = 0; i < entries; i++) {
+                flavors[i] = ntohl(*p++);
+                dprintk("NFS:\tflavor %u: %d\n", i, flavors[i]);
        }
+        *count = i;
        return 0;
 }
-#define MNT_dirpath_sz          (1 + 256)
+static int mnt_dec_mountres3(struct rpc_rqst *req, __be32 *p,
-#define MNT_fhstatus_sz         (1 + 8)
+                             struct mountres *res)
-#define MNT_fhstatus3_sz        (1 + 16)
+{
+        struct xdr_stream xdr;
+        int status;
+        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_fhs_status(&xdr, res);
+        if (unlikely(status != 0 || res->errno != 0))
+                return status;
+        status = decode_fhandle3(&xdr, res);
+        if (unlikely(status != 0)) {
+                res->errno = -EBADHANDLE;
+                return 0;
+        }
+        return decode_auth_flavors(&xdr, res);
+}
 static struct rpc_procinfo mnt_procedures[] = {
-        [MNTPROC_MNT] = {
+        [MOUNTPROC_MNT] = {
-                .p_proc         = MNTPROC_MNT,
+                .p_proc         = MOUNTPROC_MNT,
-                .p_encode       = (kxdrproc_t) xdr_encode_dirpath,
+                .p_encode       = (kxdrproc_t)mnt_enc_dirpath,
-                .p_decode       = (kxdrproc_t) xdr_decode_fhstatus,
+                .p_decode       = (kxdrproc_t)mnt_dec_mountres,
-                .p_arglen       = MNT_dirpath_sz,
+                .p_arglen       = MNT_enc_dirpath_sz,
-                .p_replen       = MNT_fhstatus_sz,
+                .p_replen       = MNT_dec_mountres_sz,
-                .p_statidx      = MNTPROC_MNT,
+                .p_statidx      = MOUNTPROC_MNT,
                .p_name         = "MOUNT",
        },
 };
@@ -159,10 +412,10 @@ static struct rpc_procinfo mnt_procedures[] = {
 static struct rpc_procinfo mnt3_procedures[] = {
        [MOUNTPROC3_MNT] = {
                .p_proc         = MOUNTPROC3_MNT,
-                .p_encode       = (kxdrproc_t) xdr_encode_dirpath,
+                .p_encode       = (kxdrproc_t)mnt_enc_dirpath,
-                .p_decode       = (kxdrproc_t) xdr_decode_fhstatus3,
+                .p_decode       = (kxdrproc_t)mnt_dec_mountres3,
-                .p_arglen       = MNT_dirpath_sz,
+                .p_arglen       = MNT_enc_dirpath_sz,
-                .p_replen       = MNT_fhstatus3_sz,
+                .p_replen       = MNT_dec_mountres3_sz,
                .p_statidx      = MOUNTPROC3_MNT,
                .p_name         = "MOUNT",
        },
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index f01caec84463..40c766782891 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -65,6 +65,11 @@ char *nfs_path(const char *base,
                dentry = dentry->d_parent;
        }
        spin_unlock(&dcache_lock);
+        if (*end != '/') {
+                if (--buflen < 0)
+                        goto Elong;
+                *--end = '/';
+        }
        namelen = strlen(base);
        /* Strip off excess slashes in base string */
        while (namelen > 0 && base[namelen - 1] == '/')
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 6bbf0e6daad2..bac60515a4b3 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -207,8 +207,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
        status = nfs_revalidate_inode(server, inode);
        if (status < 0)
                return ERR_PTR(status);
-        if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
-                nfs_zap_acl_cache(inode);
        acl = nfs3_get_cached_acl(inode, type);
        if (acl != ERR_PTR(-EAGAIN))
                return acl;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 84345deab26f..6ea07a3c75d4 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
        NFS4CLNT_RECLAIM_REBOOT,
        NFS4CLNT_RECLAIM_NOGRACE,
        NFS4CLNT_DELEGRETURN,
+        NFS4CLNT_SESSION_SETUP,
 };
 /*
@@ -177,6 +178,14 @@ struct nfs4_state_recovery_ops {
        int state_flag_bit;
        int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
        int (*recover_lock)(struct nfs4_state *, struct file_lock *);
+        int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
+        struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
+};
+struct nfs4_state_maintenance_ops {
+        int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *);
+        struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *);
+        int (*renew_lease)(struct nfs_client *, struct rpc_cred *);
 };
 extern const struct dentry_operations nfs4_dentry_operations;
@@ -193,6 +202,7 @@ extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struc
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
+extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait);
 extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
@@ -200,8 +210,32 @@ extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fh
 extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
                struct nfs4_fs_locations *fs_locations, struct page *page);
-extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
+extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[];
-extern struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops;
+extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[];
+#if defined(CONFIG_NFS_V4_1)
+extern int nfs4_setup_sequence(struct nfs_client *clp,
+                struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
+                int cache_reply, struct rpc_task *task);
+extern void nfs4_destroy_session(struct nfs4_session *session);
+extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
+extern int nfs4_proc_create_session(struct nfs_client *, int reset);
+extern int nfs4_proc_destroy_session(struct nfs4_session *);
+extern int nfs4_init_session(struct nfs_server *server);
+#else /* CONFIG_NFS_V4_1 */
+static inline int nfs4_setup_sequence(struct nfs_client *clp,
+                struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
+                int cache_reply, struct rpc_task *task)
+{
+        return 0;
+}
+static inline int nfs4_init_session(struct nfs_server *server)
+{
+        return 0;
+}
+#endif /* CONFIG_NFS_V4_1 */
+extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[];
 extern const u32 nfs4_fattr_bitmap[2];
 extern const u32 nfs4_statfs_bitmap[2];
@@ -216,7 +250,12 @@ extern void nfs4_kill_renewd(struct nfs_client *);
 extern void nfs4_renew_state(struct work_struct *);
 /* nfs4state.c */
+struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp);
 struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
+#if defined(CONFIG_NFS_V4_1)
+struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
+struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
+#endif /* CONFIG_NFS_V4_1 */
 extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
 extern void nfs4_put_state_owner(struct nfs4_state_owner *);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4674f8092da8..6917311f201c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -45,14 +45,16 @@
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
-#include <linux/smp_lock.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/sunrpc/bc_xprt.h>
 #include "nfs4_fs.h"
 #include "delegation.h"
 #include "internal.h"
 #include "iostat.h"
+#include "callback.h"
 #define NFSDBG_FACILITY         NFSDBG_PROC
@@ -247,7 +249,25 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
                        ret = nfs4_wait_clnt_recover(clp);
                        if (ret == 0)
                                exception->retry = 1;
+#if !defined(CONFIG_NFS_V4_1)
                        break;
+#else /* !defined(CONFIG_NFS_V4_1) */
+                        if (!nfs4_has_session(server->nfs_client))
+                                break;
+                        /* FALLTHROUGH */
+                case -NFS4ERR_BADSESSION:
+                case -NFS4ERR_BADSLOT:
+                case -NFS4ERR_BAD_HIGH_SLOT:
+                case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+                case -NFS4ERR_DEADSESSION:
+                case -NFS4ERR_SEQ_FALSE_RETRY:
+                case -NFS4ERR_SEQ_MISORDERED:
+                        dprintk("%s ERROR: %d Reset session\n", __func__,
+                                errorcode);
+                        set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
+                        exception->retry = 1;
+                        /* FALLTHROUGH */
+#endif /* !defined(CONFIG_NFS_V4_1) */
                case -NFS4ERR_FILE_OPEN:
                case -NFS4ERR_GRACE:
                case -NFS4ERR_DELAY:
@@ -271,6 +291,353 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
        spin_unlock(&clp->cl_lock);
 }
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * nfs4_free_slot - free a slot and efficiently update slot table.
+ *
+ * Freeing a slot is trivially done by clearing its respective bit in the
+ * bitmap. If the freed slotid equals highest_used_slotid we want to update
+ * it so that the server would be able to size down the slot table if needed,
+ * otherwise we know that the highest_used_slotid is still in use.
+ * When updating highest_used_slotid there may be "holes" in the bitmap, so
+ * the bitmap is searched for the now highest slotid in use; if none is
+ * found, highest_used_slotid is set to -1.
+ *
+ * Takes slot_tbl_lock itself and wakes up the next task on slot_tbl_waitq.
+ */
+static void
+nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
+{
+        int slotid = free_slotid;
+        spin_lock(&tbl->slot_tbl_lock);
+        /* clear used bit in bitmap */
+        __clear_bit(slotid, tbl->used_slots);
+        /* update highest_used_slotid when it is freed */
+        if (slotid == tbl->highest_used_slotid) {
+                slotid = find_last_bit(tbl->used_slots, tbl->max_slots);
+                if (slotid >= 0 && slotid < tbl->max_slots)
+                        tbl->highest_used_slotid = slotid;
+                else
+                        tbl->highest_used_slotid = -1;
+        }
+        rpc_wake_up_next(&tbl->slot_tbl_waitq);
+        spin_unlock(&tbl->slot_tbl_lock);
+        dprintk("%s: free_slotid %u highest_used_slotid %d\n", __func__,
+                free_slotid, tbl->highest_used_slotid);
+}
+void nfs41_sequence_free_slot(const struct nfs_client *clp,
+                              struct nfs4_sequence_res *res)
+{
+        struct nfs4_slot_table *tbl;
+        if (!nfs4_has_session(clp)) {
+                dprintk("%s: No session\n", __func__);
+                return;
+        }
+        tbl = &clp->cl_session->fc_slot_table;
+        if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) {
+                dprintk("%s: No slot\n", __func__);
+                /* No slot is held in res: just wake up the next waiter,
+                 * since we may not have consumed a slot after all */
+                rpc_wake_up_next(&tbl->slot_tbl_waitq);
+                return;
+        }
+        nfs4_free_slot(tbl, res->sr_slotid);
+        res->sr_slotid = NFS4_MAX_SLOT_TABLE;   /* res no longer holds a slot */
+}
+static void nfs41_sequence_done(struct nfs_client *clp,
+                                struct nfs4_sequence_res *res,
+                                int rpc_status)
+{
+        unsigned long timestamp;
+        struct nfs4_slot_table *tbl;
+        struct nfs4_slot *slot;
+        /*
+         * sr_status remains 1 if an RPC level error occurred. The server
+         * may or may not have processed the sequence operation.
+         * Proceed as if the server received and processed the sequence
+         * operation.
+         */
+        if (res->sr_status == 1)
+                res->sr_status = NFS_OK;
+        /* -ERESTARTSYS can result in skipping nfs41_setup_sequence */
+        if (res->sr_slotid == NFS4_MAX_SLOT_TABLE)
+                goto out;
+        tbl = &clp->cl_session->fc_slot_table;
+        slot = tbl->slots + res->sr_slotid;
+        if (res->sr_status == 0) {
+                /* Update the slot's sequence and clientid lease timer */
+                ++slot->seq_nr;
+                timestamp = res->sr_renewal_time;
+                spin_lock(&clp->cl_lock);
+                if (time_before(clp->cl_last_renewal, timestamp))
+                        clp->cl_last_renewal = timestamp;
+                spin_unlock(&clp->cl_lock);
+                return; /* success: the slot is not freed here */
+        }
+out:
+        /* The session may be reset by one of the error handlers. */
+        dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
+        nfs41_sequence_free_slot(clp, res);
+}
+/*
+ * nfs4_find_slot - efficiently look for a free slot
+ *
+ * nfs4_find_slot looks for an unset bit in the used_slots bitmap.
+ * If found, we mark the slot as used and update the highest_used_slotid;
+ * setting up the sequence operation args is left to the caller.
+ * The slot number is returned if found, or NFS4_MAX_SLOT_TABLE otherwise.
+ *
+ * Note: must be called under the slot_tbl_lock.
+ */
+static u8
+nfs4_find_slot(struct nfs4_slot_table *tbl, struct rpc_task *task)
+{
+        int slotid;
+        u8 ret_id = NFS4_MAX_SLOT_TABLE;
+        BUILD_BUG_ON((u8)NFS4_MAX_SLOT_TABLE != (int)NFS4_MAX_SLOT_TABLE);
+        dprintk("--> %s used_slots=%04lx highest_used=%d max_slots=%d\n",
+                __func__, tbl->used_slots[0], tbl->highest_used_slotid,
+                tbl->max_slots);
+        slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots);
+        if (slotid >= tbl->max_slots)
+                goto out;
+        __set_bit(slotid, tbl->used_slots);
+        if (slotid > tbl->highest_used_slotid)
+                tbl->highest_used_slotid = slotid;
+        ret_id = slotid;
+out:
+        dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n",
+                __func__, tbl->used_slots[0], tbl->highest_used_slotid, ret_id);
+        return ret_id;
+}
+static int nfs4_recover_session(struct nfs4_session *session)
+{
+        struct nfs_client *clp = session->clp;
+        int ret;
+        for (;;) {
+                ret = nfs4_wait_clnt_recover(clp);
+                if (ret != 0)
+                        return ret;     /* waiting failed: report the error */
+                if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
+                        break;  /* session setup is no longer pending */
+                nfs4_schedule_state_manager(clp);       /* still pending: retry */
+        }
+        return 0;
+}
+static int nfs41_setup_sequence(struct nfs4_session *session,
+                                struct nfs4_sequence_args *args,
+                                struct nfs4_sequence_res *res,
+                                int cache_reply,
+                                struct rpc_task *task)
+{
+        struct nfs4_slot *slot;
+        struct nfs4_slot_table *tbl;
+        int status = 0;
+        u8 slotid;
+        dprintk("--> %s\n", __func__);
+        /* res already holds a slot? then there is nothing to set up */
+        if (res->sr_slotid != NFS4_MAX_SLOT_TABLE)
+                return 0;
+        memset(res, 0, sizeof(*res));
+        res->sr_slotid = NFS4_MAX_SLOT_TABLE;
+        tbl = &session->fc_slot_table;
+        spin_lock(&tbl->slot_tbl_lock);
+        if (test_bit(NFS4CLNT_SESSION_SETUP, &session->clp->cl_state)) {
+                if (tbl->highest_used_slotid != -1) {
+                        rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
+                        spin_unlock(&tbl->slot_tbl_lock);
+                        dprintk("<-- %s: Session reset: draining\n", __func__);
+                        return -EAGAIN;
+                }
+                /* No slot is in use (table is empty); start the reset thread */
+                dprintk("%s Session Reset\n", __func__);
+                spin_unlock(&tbl->slot_tbl_lock);
+                status = nfs4_recover_session(session);
+                if (status)
+                        return status;
+                spin_lock(&tbl->slot_tbl_lock);
+        }
+        slotid = nfs4_find_slot(tbl, task);
+        if (slotid == NFS4_MAX_SLOT_TABLE) {
+                rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
+                spin_unlock(&tbl->slot_tbl_lock);
+                dprintk("<-- %s: no free slots\n", __func__);
+                return -EAGAIN;
+        }
+        spin_unlock(&tbl->slot_tbl_lock);
+        slot = tbl->slots + slotid;
+        args->sa_session = session;
+        args->sa_slotid = slotid;
+        args->sa_cache_this = cache_reply;
+        dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
+        res->sr_session = session;
+        res->sr_slotid = slotid;
+        res->sr_renewal_time = jiffies;
+        /*
+         * sr_status is only set in decode_sequence, and so will remain
+         * set to 1 if an rpc level failure occurs.
+         */
+        res->sr_status = 1;
+        return 0;
+}
+int nfs4_setup_sequence(struct nfs_client *clp,
+                        struct nfs4_sequence_args *args,
+                        struct nfs4_sequence_res *res,
+                        int cache_reply,
+                        struct rpc_task *task)
+{
+        int ret = 0;
+        dprintk("--> %s clp %p session %p sr_slotid %d\n",
+                __func__, clp, clp->cl_session, res->sr_slotid);
+        if (!nfs4_has_session(clp))
+                goto out;
+        ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply,
+                                   task);
+        if (ret && ret != -EAGAIN) {
+                /* real error (not a retry): terminate rpc task */
+                task->tk_status = ret;
+                task->tk_action = NULL;
+        }
+out:
+        dprintk("<-- %s status=%d\n", __func__, ret);
+        return ret;
+}
+struct nfs41_call_sync_data {
+        struct nfs_client *clp;
+        struct nfs4_sequence_args *seq_args;
+        struct nfs4_sequence_res *seq_res;
+        int cache_reply;
+};
+static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
+{
+        struct nfs41_call_sync_data *data = calldata;
+        dprintk("--> %s data->clp->cl_session %p\n", __func__,
+                data->clp->cl_session);
+        if (nfs4_setup_sequence(data->clp, data->seq_args,
+                                data->seq_res, data->cache_reply, task))
+                return;
+        rpc_call_start(task);
+}
+static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
+{
+        struct nfs41_call_sync_data *data = calldata;
+        nfs41_sequence_done(data->clp, data->seq_res, task->tk_status);
+        nfs41_sequence_free_slot(data->clp, data->seq_res);
+}
+struct rpc_call_ops nfs41_call_sync_ops = {
+        .rpc_call_prepare = nfs41_call_sync_prepare,
+        .rpc_call_done = nfs41_call_sync_done,
+};
+static int nfs4_call_sync_sequence(struct nfs_client *clp,
+                                   struct rpc_clnt *clnt,
+                                   struct rpc_message *msg,
+                                   struct nfs4_sequence_args *args,
+                                   struct nfs4_sequence_res *res,
+                                   int cache_reply)
+{
+        int ret;
+        struct rpc_task *task;
+        struct nfs41_call_sync_data data = {
+                .clp = clp,
+                .seq_args = args,
+                .seq_res = res,
+                .cache_reply = cache_reply,
+        };
+        struct rpc_task_setup task_setup = {
+                .rpc_client = clnt,
+                .rpc_message = msg,
+                .callback_ops = &nfs41_call_sync_ops,
+                .callback_data = &data
+        };
+        res->sr_slotid = NFS4_MAX_SLOT_TABLE;
+        task = rpc_run_task(&task_setup);
+        if (IS_ERR(task))
+                ret = PTR_ERR(task);
+        else {
+                ret = task->tk_status;
+                rpc_put_task(task);
+        }
+        return ret;
+}
+int _nfs4_call_sync_session(struct nfs_server *server,
+                            struct rpc_message *msg,
+                            struct nfs4_sequence_args *args,
+                            struct nfs4_sequence_res *res,
+                            int cache_reply)
+{
+        return nfs4_call_sync_sequence(server->nfs_client, server->client,
+                                       msg, args, res, cache_reply);
+}
+#endif /* CONFIG_NFS_V4_1 */
+int _nfs4_call_sync(struct nfs_server *server,
+                    struct rpc_message *msg,
+                    struct nfs4_sequence_args *args,
+                    struct nfs4_sequence_res *res,
+                    int cache_reply)
+{
+        args->sa_session = res->sr_session = NULL;
+        return rpc_call_sync(server->client, msg, 0);
+}
+#define nfs4_call_sync(server, msg, args, res, cache_reply) \
+        (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \
+                        &(res)->seq_res, (cache_reply))
+static void nfs4_sequence_done(const struct nfs_server *server,
+                               struct nfs4_sequence_res *res, int rpc_status)
+{
+#ifdef CONFIG_NFS_V4_1
+        if (nfs4_has_session(server->nfs_client))
+                nfs41_sequence_done(server->nfs_client, res, rpc_status);
+#endif /* CONFIG_NFS_V4_1 */
+}
+/* no restart, therefore free slot here */
+static void nfs4_sequence_done_free_slot(const struct nfs_server *server,
+                                         struct nfs4_sequence_res *res,
+                                         int rpc_status)
+{
+        nfs4_sequence_done(server, res, rpc_status);
+        nfs4_sequence_free_slot(server->nfs_client, res);
+}
 static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 {
        struct nfs_inode *nfsi = NFS_I(dir);
@@ -312,6 +679,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
        p->o_res.server = p->o_arg.server;
        nfs_fattr_init(&p->f_attr);
        nfs_fattr_init(&p->dir_attr);
+        p->o_res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 }
 static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
@@ -804,16 +1172,30 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
                err = _nfs4_open_delegation_recall(ctx, state, stateid);
                switch (err) {
                        case 0:
-                                return err;
+                        case -ENOENT:
+                        case -ESTALE:
+                                goto out;
                        case -NFS4ERR_STALE_CLIENTID:
                        case -NFS4ERR_STALE_STATEID:
                        case -NFS4ERR_EXPIRED:
                                /* Don't recall a delegation if it was lost */
                                nfs4_schedule_state_recovery(server->nfs_client);
-                                return err;
+                                goto out;
+                        case -ERESTARTSYS:
+                                /*
+                                 * The show must go on: exit, but mark the
+                                 * stateid as needing recovery.
+                                 */
+                        case -NFS4ERR_ADMIN_REVOKED:
+                        case -NFS4ERR_BAD_STATEID:
+                                nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
+                        case -ENOMEM:
+                                err = 0;
+                                goto out;
                }
                err = nfs4_handle_exception(server, err, &exception);
        } while (exception.retry);
+out:
        return err;
 }
@@ -929,6 +1311,10 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
                nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
        }
        data->timestamp = jiffies;
+        if (nfs4_setup_sequence(data->o_arg.server->nfs_client,
+                                &data->o_arg.seq_args,
+                                &data->o_res.seq_res, 1, task))
+                return;
        rpc_call_start(task);
        return;
 out_no_action:
@@ -941,6 +1327,10 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
        struct nfs4_opendata *data = calldata;
        data->rpc_status = task->tk_status;
+        nfs4_sequence_done_free_slot(data->o_arg.server, &data->o_res.seq_res,
+                                     task->tk_status);
        if (RPC_ASSASSINATED(task))
                return;
        if (task->tk_status == 0) {
@@ -1269,7 +1659,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
        } else
                memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
-        status = rpc_call_sync(server->client, &msg, 0);
+        status = nfs4_call_sync(server, &msg, &arg, &res, 1);
        if (status == 0 && state != NULL)
                renew_lease(server, timestamp);
        return status;
@@ -1318,6 +1708,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
        struct nfs4_state *state = calldata->state;
        struct nfs_server *server = NFS_SERVER(calldata->inode);
+        nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status);
        if (RPC_ASSASSINATED(task))
                return;
        /* hmm. we are done with the inode, and in the process of freeing
@@ -1336,10 +1727,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
                                break;
                default:
                        if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
-                                rpc_restart_call(task);
+                                nfs4_restart_rpc(task, server->nfs_client);
                                return;
                        }
        }
+        nfs4_sequence_free_slot(server->nfs_client, &calldata->res.seq_res);
        nfs_refresh_inode(calldata->inode, calldata->res.fattr);
 }
@@ -1380,6 +1772,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
                calldata->arg.fmode = FMODE_WRITE;
        }
        calldata->timestamp = jiffies;
+        if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client,
+                                &calldata->arg.seq_args, &calldata->res.seq_res,
+                                1, task))
+                return;
        rpc_call_start(task);
 }
@@ -1419,13 +1815,15 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
        };
        int status = -ENOMEM;
-        calldata = kmalloc(sizeof(*calldata), GFP_KERNEL);
+        calldata = kzalloc(sizeof(*calldata), GFP_KERNEL);
        if (calldata == NULL)
                goto out;
        calldata->inode = state->inode;
        calldata->state = state;
        calldata->arg.fh = NFS_FH(state->inode);
        calldata->arg.stateid = &state->open_stateid;
+        if (nfs4_has_session(server->nfs_client))
+                memset(calldata->arg.stateid->data, 0, 4);    /* clear seqid */
        /* Serialization for the sequence id */
        calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid);
        if (calldata->arg.seqid == NULL)
@@ -1435,6 +1833,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
        calldata->res.fattr = &calldata->fattr;
        calldata->res.seqid = calldata->arg.seqid;
        calldata->res.server = server;
+        calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        calldata->path.mnt = mntget(path->mnt);
        calldata->path.dentry = dget(path->dentry);
@@ -1584,15 +1983,18 @@ void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
 static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
+        struct nfs4_server_caps_arg args = {
+                .fhandle = fhandle,
+        };
        struct nfs4_server_caps_res res = {};
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SERVER_CAPS],
-                .rpc_argp = fhandle,
+                .rpc_argp = &args,
                .rpc_resp = &res,
        };
        int status;
-        status = rpc_call_sync(server->client, &msg, 0);
+        status = nfs4_call_sync(server, &msg, &args, &res, 0);
        if (status == 0) {
                memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
                if (res.attr_bitmask[0] & FATTR4_WORD0_ACL)
@@ -1606,6 +2008,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
                server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
                server->acl_bitmask = res.acl_bitmask;
        }
        return status;
 }
@@ -1637,8 +2040,9 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
                .rpc_argp = &args,
                .rpc_resp = &res,
        };
        nfs_fattr_init(info->fattr);
-        return rpc_call_sync(server->client, &msg, 0);
+        return nfs4_call_sync(server, &msg, &args, &res, 0);
 }
 static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -1728,7 +2132,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
        };
        
        nfs_fattr_init(fattr);
-        return rpc_call_sync(server->client, &msg, 0);
+        return nfs4_call_sync(server, &msg, &args, &res, 0);
 }
 static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
@@ -1812,7 +2216,7 @@ static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *d
        nfs_fattr_init(fattr);
        dprintk("NFS call  lookupfh %s\n", name->name);
-        status = rpc_call_sync(server->client, &msg, 0);
+        status = nfs4_call_sync(server, &msg, &args, &res, 0);
        dprintk("NFS reply lookupfh: %d\n", status);
        return status;
 }
@@ -1898,7 +2302,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
                        args.access |= NFS4_ACCESS_EXECUTE;
        }
        nfs_fattr_init(&fattr);
-        status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+        status = nfs4_call_sync(server, &msg, &args, &res, 0);
        if (!status) {
                entry->mask = 0;
                if (res.access & NFS4_ACCESS_READ)
@@ -1957,13 +2361,14 @@ static int _nfs4_proc_readlink(struct inode *inode, struct page *page,
                .pglen    = pglen,
                .pages    = &page,
        };
+        struct nfs4_readlink_res res;
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READLINK],
                .rpc_argp = &args,
-                .rpc_resp = NULL,
+                .rpc_resp = &res,
        };
-        return rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+        return nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0);
 }
 static int nfs4_proc_readlink(struct inode *inode, struct page *page,
@@ -2057,7 +2462,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
        int                     status;
        nfs_fattr_init(&res.dir_attr);
-        status = rpc_call_sync(server->client, &msg, 0);
+        status = nfs4_call_sync(server, &msg, &args, &res, 1);
        if (status == 0) {
                update_changeattr(dir, &res.cinfo);
                nfs_post_op_update_inode(dir, &res.dir_attr);
@@ -2092,8 +2497,10 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 {
        struct nfs_removeres *res = task->tk_msg.rpc_resp;
+        nfs4_sequence_done(res->server, &res->seq_res, task->tk_status);
        if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
                return 0;
+        nfs4_sequence_free_slot(res->server->nfs_client, &res->seq_res);
        update_changeattr(dir, &res->cinfo);
        nfs_post_op_update_inode(dir, &res->dir_attr);
        return 1;
@@ -2125,7 +2532,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
        
        nfs_fattr_init(res.old_fattr);
        nfs_fattr_init(res.new_fattr);
-        status = rpc_call_sync(server->client, &msg, 0);
+        status = nfs4_call_sync(server, &msg, &arg, &res, 1);
        if (!status) {
                update_changeattr(old_dir, &res.old_cinfo);
@@ -2174,7 +2581,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
        nfs_fattr_init(res.fattr);
        nfs_fattr_init(res.dir_attr);
-        status = rpc_call_sync(server->client, &msg, 0);
+        status = nfs4_call_sync(server, &msg, &arg, &res, 1);
        if (!status) {
                update_changeattr(dir, &res.cinfo);
                nfs_post_op_update_inode(dir, res.dir_attr);
@@ -2235,7 +2642,8 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
 static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
 {
-        int status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
+        int status = nfs4_call_sync(NFS_SERVER(dir), &data->msg,
+                                    &data->arg, &data->res, 1);
        if (status == 0) {
                update_changeattr(dir, &data->res.dir_cinfo);
                nfs_post_op_update_inode(dir, data->res.dir_fattr);
@@ -2344,7 +2752,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
                        (unsigned long long)cookie);
        nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args);
        res.pgbase = args.pgbase;
-        status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+        status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0);
        if (status == 0)
                memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
@@ -2422,14 +2830,17 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
                .fh = fhandle,
                .bitmask = server->attr_bitmask,
        };
+        struct nfs4_statfs_res res = {
+                .fsstat = fsstat,
+        };
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_STATFS],
                .rpc_argp = &args,
-                .rpc_resp = fsstat,
+                .rpc_resp = &res,
        };
        nfs_fattr_init(fsstat->fattr);
-        return rpc_call_sync(server->client, &msg, 0);
+        return  nfs4_call_sync(server, &msg, &args, &res, 0);
 }
 static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat)
@@ -2451,13 +2862,16 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
                .fh = fhandle,
                .bitmask = server->attr_bitmask,
        };
+        struct nfs4_fsinfo_res res = {
+                .fsinfo = fsinfo,
+        };
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSINFO],
                .rpc_argp = &args,
-                .rpc_resp = fsinfo,
+                .rpc_resp = &res,
        };
-        return rpc_call_sync(server->client, &msg, 0);
+        return nfs4_call_sync(server, &msg, &args, &res, 0);
 }
 static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
@@ -2486,10 +2900,13 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle
                .fh = fhandle,
                .bitmask = server->attr_bitmask,
        };
+        struct nfs4_pathconf_res res = {
+                .pathconf = pathconf,
+        };
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PATHCONF],
                .rpc_argp = &args,
-                .rpc_resp = pathconf,
+                .rpc_resp = &res,
        };
        /* None of the pathconf attributes are mandatory to implement */
@@ -2499,7 +2916,7 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle
        }
        nfs_fattr_init(pathconf->fattr);
-        return rpc_call_sync(server->client, &msg, 0);
+        return nfs4_call_sync(server, &msg, &args, &res, 0);
 }
 static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -2520,8 +2937,13 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
        struct nfs_server *server = NFS_SERVER(data->inode);
+        dprintk("--> %s\n", __func__);
+        /* nfs4_sequence_free_slot called in the read rpc_call_done */
+        nfs4_sequence_done(server, &data->res.seq_res, task->tk_status);
        if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
-                rpc_restart_call(task);
+                nfs4_restart_rpc(task, server->nfs_client);
                return -EAGAIN;
        }
@@ -2541,8 +2963,12 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
        struct inode *inode = data->inode;
        
+        /* slot is freed in nfs_writeback_done */
+        nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
+                           task->tk_status);
        if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
-                rpc_restart_call(task);
+                nfs4_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
                return -EAGAIN;
        }
        if (task->tk_status >= 0) {
@@ -2567,10 +2993,14 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 {
        struct inode *inode = data->inode;
        
+        nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
+                           task->tk_status);
        if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
-                rpc_restart_call(task);
+                nfs4_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
                return -EAGAIN;
        }
+        nfs4_sequence_free_slot(NFS_SERVER(inode)->nfs_client,
+                                &data->res.seq_res);
        nfs_refresh_inode(inode, data->res.fattr);
        return 0;
 }
@@ -2603,6 +3033,9 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
        if (time_before(clp->cl_last_renewal,timestamp))
                clp->cl_last_renewal = timestamp;
        spin_unlock(&clp->cl_lock);
+        dprintk("%s calling put_rpccred on rpc_cred %p\n", __func__,
+                                task->tk_msg.rpc_cred);
+        put_rpccred(task->tk_msg.rpc_cred);
 }
 static const struct rpc_call_ops nfs4_renew_ops = {
@@ -2742,12 +3175,14 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
                .acl_pages = pages,
                .acl_len = buflen,
        };
-        size_t resp_len = buflen;
+        struct nfs_getaclres res = {
+                .acl_len = buflen,
+        };
        void *resp_buf;
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL],
                .rpc_argp = &args,
-                .rpc_resp = &resp_len,
+                .rpc_resp = &res,
        };
        struct page *localpage = NULL;
        int ret;
@@ -2761,26 +3196,26 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
                        return -ENOMEM;
                args.acl_pages[0] = localpage;
                args.acl_pgbase = 0;
-                resp_len = args.acl_len = PAGE_SIZE;
+                args.acl_len = PAGE_SIZE;
        } else {
                resp_buf = buf;
                buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
        }
-        ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+        ret = nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0);
        if (ret)
                goto out_free;
-        if (resp_len > args.acl_len)
+        if (res.acl_len > args.acl_len)
-                nfs4_write_cached_acl(inode, NULL, resp_len);
+                nfs4_write_cached_acl(inode, NULL, res.acl_len);
        else
-                nfs4_write_cached_acl(inode, resp_buf, resp_len);
+                nfs4_write_cached_acl(inode, resp_buf, res.acl_len);
        if (buf) {
                ret = -ERANGE;
-                if (resp_len > buflen)
+                if (res.acl_len > buflen)
                        goto out_free;
                if (localpage)
-                        memcpy(buf, resp_buf, resp_len);
+                        memcpy(buf, resp_buf, res.acl_len);
        }
-        ret = resp_len;
+        ret = res.acl_len;
 out_free:
        if (localpage)
                __free_page(localpage);
@@ -2810,8 +3245,6 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
        ret = nfs_revalidate_inode(server, inode);
        if (ret < 0)
                return ret;
-        if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
-                nfs_zap_acl_cache(inode);
        ret = nfs4_read_cached_acl(inode, buf, buflen);
        if (ret != -ENOENT)
                return ret;
@@ -2827,10 +3260,11 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
                .acl_pages      = pages,
                .acl_len        = buflen,
        };
+        struct nfs_setaclres res;
        struct rpc_message msg = {
                .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_SETACL],
                .rpc_argp       = &arg,
-                .rpc_resp       = NULL,
+                .rpc_resp       = &res,
        };
        int ret;
@@ -2838,7 +3272,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
                return -EOPNOTSUPP;
        nfs_inode_return_delegation(inode);
        buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
-        ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+        ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
        nfs_access_zap_cache(inode);
        nfs_zap_acl_cache(inode);
        return ret;
@@ -2857,10 +3291,8 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
 }
 static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
+_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state)
 {
-        struct nfs_client *clp = server->nfs_client;
        if (!clp || task->tk_status >= 0)
                return 0;
        switch(task->tk_status) {
@@ -2879,8 +3311,23 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
                                rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
                        task->tk_status = 0;
                        return -EAGAIN;
+#if defined(CONFIG_NFS_V4_1)
+                case -NFS4ERR_BADSESSION:
+                case -NFS4ERR_BADSLOT:
+                case -NFS4ERR_BAD_HIGH_SLOT:
+                case -NFS4ERR_DEADSESSION:
+                case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+                case -NFS4ERR_SEQ_FALSE_RETRY:
+                case -NFS4ERR_SEQ_MISORDERED:
+                        dprintk("%s ERROR %d, Reset session\n", __func__,
+                                task->tk_status);
+                        set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
+                        task->tk_status = 0;
+                        return -EAGAIN;
+#endif /* CONFIG_NFS_V4_1 */
                case -NFS4ERR_DELAY:
-                        nfs_inc_server_stats(server, NFSIOS_DELAY);
+                        if (server)
+                                nfs_inc_server_stats(server, NFSIOS_DELAY);
                case -NFS4ERR_GRACE:
                        rpc_delay(task, NFS4_POLL_RETRY_MAX);
                        task->tk_status = 0;
@@ -2893,6 +3340,12 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
        return 0;
 }
+/*
+ * Convenience wrapper around _nfs4_async_handle_error() for callers that
+ * have an nfs_server: the client is taken from @server->nfs_client.
+ */
+static int
+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
+{
+        struct nfs_client *clp = server->nfs_client;
+
+        return _nfs4_async_handle_error(task, server, clp, state);
+}
 int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred)
 {
        nfs4_verifier sc_verifier;
@@ -3000,6 +3453,10 @@ struct nfs4_delegreturndata {
 static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 {
        struct nfs4_delegreturndata *data = calldata;
+        nfs4_sequence_done_free_slot(data->res.server, &data->res.seq_res,
+                                     task->tk_status);
        data->rpc_status = task->tk_status;
        if (data->rpc_status == 0)
                renew_lease(data->res.server, data->timestamp);
@@ -3010,7 +3467,25 @@ static void nfs4_delegreturn_release(void *calldata)
        kfree(calldata);
 }
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * rpc_call_prepare callback for DELEGRETURN.  The call is started only
+ * when nfs4_setup_sequence() returns zero; on a non-zero return the task
+ * is left unstarted.
+ */
+static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
+{
+        struct nfs4_delegreturndata *calldata = data;
+        struct nfs_client *clp = calldata->res.server->nfs_client;
+
+        if (nfs4_setup_sequence(clp, &calldata->args.seq_args,
+                                &calldata->res.seq_res, 1, task) == 0)
+                rpc_call_start(task);
+}
+#endif /* CONFIG_NFS_V4_1 */
+/*
+ * RPC callbacks for DELEGRETURN.  The prepare hook is only wired up when
+ * CONFIG_NFS_V4_1 is enabled.
+ */
 static const struct rpc_call_ops nfs4_delegreturn_ops = {
+#if defined(CONFIG_NFS_V4_1)
+        .rpc_call_prepare = nfs4_delegreturn_prepare,
+#endif /* CONFIG_NFS_V4_1 */
        .rpc_call_done = nfs4_delegreturn_done,
        .rpc_release = nfs4_delegreturn_release,
 };
@@ -3032,7 +3507,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
        };
        int status = 0;
-        data = kmalloc(sizeof(*data), GFP_KERNEL);
+        data = kzalloc(sizeof(*data), GFP_KERNEL);
        if (data == NULL)
                return -ENOMEM;
        data->args.fhandle = &data->fh;
@@ -3042,6 +3517,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
        memcpy(&data->stateid, stateid, sizeof(data->stateid));
        data->res.fattr = &data->fattr;
        data->res.server = server;
+        data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        nfs_fattr_init(data->res.fattr);
        data->timestamp = jiffies;
        data->rpc_status = 0;
@@ -3127,7 +3603,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
                goto out;
        lsp = request->fl_u.nfs4_fl.owner;
        arg.lock_owner.id = lsp->ls_id.id;
-        status = rpc_call_sync(server->client, &msg, 0);
+        status = nfs4_call_sync(server, &msg, &arg, &res, 1);
        switch (status) {
                case 0:
                        request->fl_type = F_UNLCK;
@@ -3187,13 +3663,14 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
        struct nfs4_unlockdata *p;
        struct inode *inode = lsp->ls_state->inode;
-        p = kmalloc(sizeof(*p), GFP_KERNEL);
+        p = kzalloc(sizeof(*p), GFP_KERNEL);
        if (p == NULL)
                return NULL;
        p->arg.fh = NFS_FH(inode);
        p->arg.fl = &p->fl;
        p->arg.seqid = seqid;
        p->res.seqid = seqid;
+        p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        p->arg.stateid = &lsp->ls_stateid;
        p->lsp = lsp;
        atomic_inc(&lsp->ls_count);
@@ -3217,6 +3694,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 {
        struct nfs4_unlockdata *calldata = data;
+        nfs4_sequence_done(calldata->server, &calldata->res.seq_res,
+                           task->tk_status);
        if (RPC_ASSASSINATED(task))
                return;
        switch (task->tk_status) {
@@ -3233,8 +3712,11 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
                        break;
                default:
                        if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
-                                rpc_restart_call(task);
+                                nfs4_restart_rpc(task,
+                                                calldata->server->nfs_client);
        }
+        nfs4_sequence_free_slot(calldata->server->nfs_client,
+                                &calldata->res.seq_res);
 }
 static void nfs4_locku_prepare(struct rpc_task *task, void *data)
@@ -3249,6 +3731,10 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
                return;
        }
        calldata->timestamp = jiffies;
+        if (nfs4_setup_sequence(calldata->server->nfs_client,
+                                &calldata->arg.seq_args,
+                                &calldata->res.seq_res, 1, task))
+                return;
        rpc_call_start(task);
 }
@@ -3341,6 +3827,7 @@ struct nfs4_lockdata {
        unsigned long timestamp;
        int rpc_status;
        int cancelled;
+        struct nfs_server *server;
 };
 static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
@@ -3366,7 +3853,9 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
        p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
        p->arg.lock_owner.id = lsp->ls_id.id;
        p->res.lock_seqid = p->arg.lock_seqid;
+        p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        p->lsp = lsp;
+        p->server = server;
        atomic_inc(&lsp->ls_count);
        p->ctx = get_nfs_open_context(ctx);
        memcpy(&p->fl, fl, sizeof(p->fl));
@@ -3396,6 +3885,9 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
        } else
                data->arg.new_lock_owner = 0;
        data->timestamp = jiffies;
+        if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args,
+                                &data->res.seq_res, 1, task))
+                return;
        rpc_call_start(task);
        dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
 }
@@ -3406,6 +3898,9 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
        dprintk("%s: begin!\n", __func__);
+        nfs4_sequence_done_free_slot(data->server, &data->res.seq_res,
+                                     task->tk_status);
        data->rpc_status = task->tk_status;
        if (RPC_ASSASSINATED(task))
                goto out;
@@ -3487,8 +3982,6 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
        ret = nfs4_wait_for_completion_rpc_task(task);
        if (ret == 0) {
                ret = data->rpc_status;
-                if (ret == -NFS4ERR_DENIED)
-                        ret = -EAGAIN;
        } else
                data->cancelled = 1;
        rpc_put_task(task);
@@ -3576,9 +4069,11 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *
        int err;
        do {
+                err = _nfs4_proc_setlk(state, cmd, request);
+                if (err == -NFS4ERR_DENIED)
+                        err = -EAGAIN;
                err = nfs4_handle_exception(NFS_SERVER(state->inode),
-                                _nfs4_proc_setlk(state, cmd, request),
+                                err, &exception);
-                                &exception);
        } while (exception.retry);
        return err;
 }
@@ -3598,15 +4093,23 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
        if (request->fl_start < 0 || request->fl_end < 0)
                return -EINVAL;
-        if (IS_GETLK(cmd))
+        if (IS_GETLK(cmd)) {
-                return nfs4_proc_getlk(state, F_GETLK, request);
+                if (state != NULL)
+                        return nfs4_proc_getlk(state, F_GETLK, request);
+                return 0;
+        }
        if (!(IS_SETLK(cmd) || IS_SETLKW(cmd)))
                return -EINVAL;
-        if (request->fl_type == F_UNLCK)
+        if (request->fl_type == F_UNLCK) {
-                return nfs4_proc_unlck(state, cmd, request);
+                if (state != NULL)
+                        return nfs4_proc_unlck(state, cmd, request);
+                return 0;
+        }
+        if (state == NULL)
+                return -ENOLCK;
        do {
                status = nfs4_proc_setlk(state, cmd, request);
                if ((status != -EAGAIN) || IS_SETLK(cmd))
@@ -3630,8 +4133,37 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
                goto out;
        do {
                err = _nfs4_do_setlk(state, F_SETLK, fl, 0);
-                if (err != -NFS4ERR_DELAY)
+                switch (err) {
-                        break;
+                        default:
+                                printk(KERN_ERR "%s: unhandled error %d.\n",
+                                                __func__, err);
+                        case 0:
+                        case -ESTALE:
+                                goto out;
+                        case -NFS4ERR_EXPIRED:
+                        case -NFS4ERR_STALE_CLIENTID:
+                        case -NFS4ERR_STALE_STATEID:
+                                nfs4_schedule_state_recovery(server->nfs_client);
+                                goto out;
+                        case -ERESTARTSYS:
+                                /*
+                                 * The show must go on: exit, but mark the
+                                 * stateid as needing recovery.
+                                 */
+                        case -NFS4ERR_ADMIN_REVOKED:
+                        case -NFS4ERR_BAD_STATEID:
+                        case -NFS4ERR_OPENMODE:
+                                nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
+                                err = 0;
+                                goto out;
+                        case -ENOMEM:
+                        case -NFS4ERR_DENIED:
+                                /* kill_proc(fl->fl_pid, SIGLOST, 1); */
+                                err = 0;
+                                goto out;
+                        case -NFS4ERR_DELAY:
+                                break;
+                }
                err = nfs4_handle_exception(server, err, &exception);
        } while (exception.retry);
 out:
@@ -3706,10 +4238,13 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
                .page = page,
                .bitmask = bitmask,
        };
+        struct nfs4_fs_locations_res res = {
+                .fs_locations = fs_locations,
+        };
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
                .rpc_argp = &args,
-                .rpc_resp = fs_locations,
+                .rpc_resp = &res,
        };
        int status;
@@ -3717,24 +4252,736 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
        nfs_fattr_init(&fs_locations->fattr);
        fs_locations->server = server;
        fs_locations->nlocations = 0;
-        status = rpc_call_sync(server->client, &msg, 0);
+        status = nfs4_call_sync(server, &msg, &args, &res, 0);
        nfs_fixup_referral_attributes(&fs_locations->fattr);
        dprintk("%s: returned status = %d\n", __func__, status);
        return status;
 }
-struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = {
+#ifdef CONFIG_NFS_V4_1
+/*
+ * nfs4_proc_exchange_id()
+ *
+ * Since the clientid has expired, all compounds using sessions
+ * associated with the stale clientid will be returning
+ * NFS4ERR_BADSESSION in the sequence operation, and will therefore
+ * be in some phase of session reset.
+ *
+ * Builds a client owner id of the form
+ * "<cl_ipaddr>/<server address> <uniquifier>" and sends EXCHANGE_ID.
+ * While the server answers NFS4ERR_CLID_INUSE the uniquifier is bumped and
+ * the call is retried.  The loop ends on any other status, when signalled()
+ * is true, or when the uniquifier wraps around to zero.
+ *
+ * Returns the status of the last rpc_call_sync().
+ */
+static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
+{
+        nfs4_verifier verifier;
+        struct nfs41_exchange_id_args args = {
+                .client = clp,
+                .flags = clp->cl_exchange_flags,
+        };
+        struct nfs41_exchange_id_res res = {
+                .client = clp,
+        };
+        int status;
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID],
+                .rpc_argp = &args,
+                .rpc_resp = &res,
+                .rpc_cred = cred,
+        };
+        __be32 *p;
+
+        dprintk("--> %s\n", __func__);
+        BUG_ON(clp == NULL);
+
+        /* The verifier is the client boot time, stored in network byte order */
+        p = (__be32 *)verifier.data;
+        *p++ = htonl((u32)clp->cl_boot_time.tv_sec);
+        *p = htonl((u32)clp->cl_boot_time.tv_nsec);
+        args.verifier = &verifier;
+
+        while (1) {
+                args.id_len = scnprintf(args.id, sizeof(args.id),
+                                        "%s/%s %u",
+                                        clp->cl_ipaddr,
+                                        rpc_peeraddr2str(clp->cl_rpcclient,
+                                                         RPC_DISPLAY_ADDR),
+                                        clp->cl_id_uniquifier);
+
+                status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
+
+                /*
+                 * NFS errors are reported as negative values, so the test
+                 * must use -NFS4ERR_CLID_INUSE.  Comparing against the
+                 * positive constant never matched, which made the
+                 * uniquifier retry below unreachable.
+                 */
+                if (status != -NFS4ERR_CLID_INUSE)
+                        break;
+
+                if (signalled())
+                        break;
+
+                /* Give up once every uniquifier value has been tried */
+                if (++clp->cl_id_uniquifier == 0)
+                        break;
+        }
+
+        dprintk("<-- %s status= %d\n", __func__, status);
+        return status;
+}
+/* Callback data shared by the GET_LEASE_TIME prepare and done callbacks */
+struct nfs4_get_lease_time_data {
+        struct nfs4_get_lease_time_args *args;  /* RPC arguments; holds la_seq_args */
+        struct nfs4_get_lease_time_res *res;    /* RPC result; holds lr_seq_res */
+        struct nfs_client *clp;                 /* client whose cl_session is used */
+};
+/*
+ * rpc_call_prepare callback for GET_LEASE_TIME: set up the SEQUENCE
+ * arguments on the client's session and start the call.
+ */
+static void nfs4_get_lease_time_prepare(struct rpc_task *task,
+                                        void *calldata)
+{
+        struct nfs4_get_lease_time_data *data = calldata;
+        int status;
+
+        dprintk("--> %s\n", __func__);
+        /*
+         * Only set up the sequence here.  We are already running inside
+         * session recovery, so it must not be triggered again.
+         */
+        status = nfs41_setup_sequence(data->clp->cl_session,
+                                      &data->args->la_seq_args,
+                                      &data->res->lr_seq_res, 0, task);
+        BUG_ON(status == -EAGAIN);
+        rpc_call_start(task);
+        dprintk("<-- %s\n", __func__);
+}
+/*
+ * Called from nfs4_state_manager thread for session setup, so don't recover
+ * from sequence operation or clientid errors.
+ *
+ * On NFS4ERR_DELAY or NFS4ERR_GRACE the call is delayed by
+ * NFS4_POLL_RETRY_MIN and restarted; that path returns early and therefore
+ * skips nfs41_sequence_free_slot().  For every other status
+ * nfs41_sequence_free_slot() is called before returning.
+ */
+static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
+{
+        struct nfs4_get_lease_time_data *data =
+                        (struct nfs4_get_lease_time_data *)calldata;
+        dprintk("--> %s\n", __func__);
+        nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status);
+        switch (task->tk_status) {
+        case -NFS4ERR_DELAY:
+        case -NFS4ERR_GRACE:
+                dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
+                rpc_delay(task, NFS4_POLL_RETRY_MIN);
+                /* clear the error before the call is restarted */
+                task->tk_status = 0;
+                nfs4_restart_rpc(task, data->clp);
+                return;
+        }
+        nfs41_sequence_free_slot(data->clp, &data->res->lr_seq_res);
+        dprintk("<-- %s\n", __func__);
+}
+/*
+ * RPC callbacks for GET_LEASE_TIME.
+ * NOTE(review): unlike nfs4_delegreturn_ops this table is neither static
+ * nor const; check whether anything outside this file needs it before
+ * tightening the linkage.
+ */
+struct rpc_call_ops nfs4_get_lease_time_ops = {
+        .rpc_call_prepare = nfs4_get_lease_time_prepare,
+        .rpc_call_done = nfs4_get_lease_time_done,
+};
+/*
+ * Run GET_LEASE_TIME on @clp's RPC client using the
+ * nfs4_get_lease_time_ops callbacks.  @fsinfo is handed to the call as
+ * res.lr_fsinfo.
+ *
+ * Returns PTR_ERR() of the task if rpc_run_task() fails, otherwise the
+ * tk_status of the task.
+ */
+int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
+{
+        struct rpc_task *task;
+        struct nfs4_get_lease_time_args args;
+        struct nfs4_get_lease_time_res res = {
+                .lr_fsinfo = fsinfo,
+        };
+        /*
+         * NOTE(review): args, res and data live on this stack frame and are
+         * handed to the task as callback data; presumably the task has
+         * finished by the time rpc_run_task() returns because no async flag
+         * is set in task_setup - confirm.
+         */
+        struct nfs4_get_lease_time_data data = {
+                .args = &args,
+                .res = &res,
+                .clp = clp,
+        };
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GET_LEASE_TIME],
+                .rpc_argp = &args,
+                .rpc_resp = &res,
+        };
+        struct rpc_task_setup task_setup = {
+                .rpc_client = clp->cl_rpcclient,
+                .rpc_message = &msg,
+                .callback_ops = &nfs4_get_lease_time_ops,
+                .callback_data = &data
+        };
+        int status;
+        /* presumably NFS4_MAX_SLOT_TABLE marks "no slot assigned yet" - confirm */
+        res.lr_seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
+        dprintk("--> %s\n", __func__);
+        task = rpc_run_task(&task_setup);
+        if (IS_ERR(task))
+                status = PTR_ERR(task);
+        else {
+                status = task->tk_status;
+                rpc_put_task(task);
+        }
+        dprintk("<-- %s return %d\n", __func__, status);
+        return status;
+}
+/*
+ * Reset a slot table
+ *
+ * Sets seq_nr of the first @max_slots slots of @tbl to @ivalue and sets
+ * highest_used_slotid to -1, all under tbl->slot_tbl_lock.
+ *
+ * Returns 0 on success, or -EINVAL without touching the table when
+ * @max_slots differs from @old_max_slots.
+ */
+static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, int max_slots,
+                int old_max_slots, int ivalue)
+{
+        int i;
+        int ret = 0;
+        dprintk("--> %s: max_reqs=%u, tbl %p\n", __func__, max_slots, tbl);
+        /*
+         * Until we have dynamic slot table adjustment, insist
+         * upon the same slot table size
+         */
+        if (max_slots != old_max_slots) {
+                dprintk("%s reset slot table does't match old\n",
+                        __func__);
+                ret = -EINVAL; /*XXX NFS4ERR_REQ_TOO_BIG ? */
+                goto out;
+        }
+        spin_lock(&tbl->slot_tbl_lock);
+        for (i = 0; i < max_slots; ++i)
+                tbl->slots[i].seq_nr = ivalue;
+        tbl->highest_used_slotid = -1;
+        spin_unlock(&tbl->slot_tbl_lock);
+        dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
+                tbl, tbl->slots, tbl->max_slots);
+out:
+        dprintk("<-- %s: return %d\n", __func__, ret);
+        return ret;
+}
+/*
+ * Reset both slot tables of @session: the fore channel first (sequence
+ * numbers restart at 1), then the back channel (sequence numbers restart
+ * at 0).  A fore channel failure is returned without touching the back
+ * channel.
+ */
+static int nfs4_reset_slot_tables(struct nfs4_session *session)
+{
+        int err;
+
+        err = nfs4_reset_slot_table(&session->fc_slot_table,
+                                    session->fc_attrs.max_reqs,
+                                    session->fc_slot_table.max_slots, 1);
+        if (err != 0)
+                return err;
+
+        return nfs4_reset_slot_table(&session->bc_slot_table,
+                                     session->bc_attrs.max_reqs,
+                                     session->bc_slot_table.max_slots, 0);
+}
+/*
+ * Destroy the slot tables
+ *
+ * Frees the fore and back channel slot arrays of @session and clears both
+ * pointers.  kfree(NULL) is a no-op, so no NULL checks are needed and the
+ * function may be called on a session whose tables were never allocated.
+ */
+static void nfs4_destroy_slot_tables(struct nfs4_session *session)
+{
+        kfree(session->fc_slot_table.slots);
+        session->fc_slot_table.slots = NULL;
+
+        kfree(session->bc_slot_table.slots);
+        session->bc_slot_table.slots = NULL;
+}
+/*
+ * Initialize slot table
+ *
+ * Allocates a zeroed array of @max_slots slots, sets every seq_nr to
+ * @ivalue and installs the array in @tbl under tbl->slot_tbl_lock.
+ *
+ * Returns -ENOMEM if the allocation fails, otherwise 0.
+ * NOTE(review): when tbl->slots is already set, the new array is freed and
+ * a warning is raised, but ret was already set to 0, so the caller still
+ * sees success - confirm this is intended.
+ */
+static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
+                int max_slots, int ivalue)
+{
+        int i;
+        struct nfs4_slot *slot;
+        int ret = -ENOMEM;
+        BUG_ON(max_slots > NFS4_MAX_SLOT_TABLE);
+        dprintk("--> %s: max_reqs=%u\n", __func__, max_slots);
+        slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL);
+        if (!slot)
+                goto out;
+        for (i = 0; i < max_slots; ++i)
+                slot[i].seq_nr = ivalue;
+        ret = 0;
+        spin_lock(&tbl->slot_tbl_lock);
+        if (tbl->slots != NULL) {
+                /* drop the lock before logging and bailing out */
+                spin_unlock(&tbl->slot_tbl_lock);
+                dprintk("%s: slot table already initialized. tbl=%p slots=%p\n",
+                        __func__, tbl, tbl->slots);
+                WARN_ON(1);
+                goto out_free;
+        }
+        tbl->max_slots = max_slots;
+        tbl->slots = slot;
+        tbl->highest_used_slotid = -1;  /* no slot is currently used */
+        spin_unlock(&tbl->slot_tbl_lock);
+        dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
+                tbl, tbl->slots, tbl->max_slots);
+out:
+        dprintk("<-- %s: return %d\n", __func__, ret);
+        return ret;
+out_free:
+        kfree(slot);
+        goto out;
+}
+/*
+ * Set up the slot tables of @session: fore channel first (initial sequence
+ * number 1), then back channel (initial sequence number 0).  If the back
+ * channel cannot be set up, both tables are destroyed again.
+ */
+static int nfs4_init_slot_tables(struct nfs4_session *session)
+{
+        int err;
+
+        err = nfs4_init_slot_table(&session->fc_slot_table,
+                                   session->fc_attrs.max_reqs, 1);
+        if (err != 0)
+                return err;
+
+        err = nfs4_init_slot_table(&session->bc_slot_table,
+                                   session->bc_attrs.max_reqs, 0);
+        if (err == 0)
+                return 0;
+
+        /* Back channel failed: release what was allocated so far */
+        nfs4_destroy_slot_tables(session);
+        return err;
+}
+/*
+ * Allocate a zeroed nfs4_session for @clp and initialise the lock and RPC
+ * wait queue of both slot tables; the slot arrays themselves are not
+ * allocated here.  Also sets NFS4CLNT_SESSION_SETUP in clp->cl_state and
+ * moves clp->cl_cons_state to NFS_CS_SESSION_INITING.
+ *
+ * Returns the new session, or NULL if the allocation fails (in which case
+ * @clp is left untouched).
+ */
+struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
+{
+        struct nfs4_session *session;
+        struct nfs4_slot_table *tbl;
+        session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL);
+        if (!session)
+                return NULL;
+        set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
+        /*
+         * The create session reply races with the server back
+         * channel probe. Mark the client NFS_CS_SESSION_INITING
+         * so that the client back channel can find the
+         * nfs_client struct
+         */
+        clp->cl_cons_state = NFS_CS_SESSION_INITING;
+        tbl = &session->fc_slot_table;
+        spin_lock_init(&tbl->slot_tbl_lock);
+        rpc_init_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
+        tbl = &session->bc_slot_table;
+        spin_lock_init(&tbl->slot_tbl_lock);
+        rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
+        session->clp = clp;
+        return session;
+}
+/*
+ * Tear down @session: call nfs4_proc_destroy_session() (its return value
+ * is ignored here), destroy the transport backchannel, free the slot
+ * tables and finally free the session itself.  @session must not be used
+ * after this returns.
+ */
+void nfs4_destroy_session(struct nfs4_session *session)
+{
+        nfs4_proc_destroy_session(session);
+        dprintk("%s Destroy backchannel for xprt %p\n",
+                __func__, session->clp->cl_rpcclient->cl_xprt);
+        xprt_destroy_backchannel(session->clp->cl_rpcclient->cl_xprt,
+                                NFS41_BC_MIN_CALLBACKS);
+        nfs4_destroy_slot_tables(session);
+        kfree(session);
+}
+/*
+ * Fill in the channel attributes the client proposes in CREATE_SESSION.
+ *
+ * Fore channel: request/response sizes come from the session if
+ * nfs4_init_session already stored non-zero values there, otherwise they
+ * default to NFS_MAX_FILE_IO_SIZE.
+ *
+ * Back channel: max_resp_sz_cached is zero so that the client always sets
+ * csa_cachethis to FALSE; the current back channel DRC only supports
+ * caching the CB_SEQUENCE operation.
+ */
+static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
+{
+        struct nfs4_session *session = args->client->cl_session;
+        unsigned int fc_rqst_sz = session->fc_attrs.max_rqst_sz;
+        unsigned int fc_resp_sz = session->fc_attrs.max_resp_sz;
+
+        /* Sizes that were not preset fall back to the I/O maximum */
+        if (!fc_rqst_sz)
+                fc_rqst_sz = NFS_MAX_FILE_IO_SIZE;
+        if (!fc_resp_sz)
+                fc_resp_sz = NFS_MAX_FILE_IO_SIZE;
+
+        /* Fore channel attributes */
+        args->fc_attrs.headerpadsz = 0;
+        args->fc_attrs.max_rqst_sz = fc_rqst_sz;
+        args->fc_attrs.max_resp_sz = fc_resp_sz;
+        args->fc_attrs.max_resp_sz_cached = fc_resp_sz;
+        args->fc_attrs.max_ops = NFS4_MAX_OPS;
+        args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs;
+
+        dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "
+                "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
+                __func__,
+                args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz,
+                args->fc_attrs.max_resp_sz_cached, args->fc_attrs.max_ops,
+                args->fc_attrs.max_reqs);
+
+        /* Back channel attributes */
+        args->bc_attrs.headerpadsz = 0;
+        args->bc_attrs.max_rqst_sz = PAGE_SIZE;
+        args->bc_attrs.max_resp_sz = PAGE_SIZE;
+        args->bc_attrs.max_resp_sz_cached = 0;
+        args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
+        args->bc_attrs.max_reqs = 1;
+
+        dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
+                "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
+                __func__,
+                args->bc_attrs.max_rqst_sz, args->bc_attrs.max_resp_sz,
+                args->bc_attrs.max_resp_sz_cached, args->bc_attrs.max_ops,
+                args->bc_attrs.max_reqs);
+}
+/*
+ * Check one negotiated channel attribute: the value received (@rcvd) must
+ * not exceed the value sent (@sent).  On violation a warning naming @chan
+ * and @attr_name is logged and -EINVAL is returned; otherwise 0.
+ */
+static int _verify_channel_attr(char *chan, char *attr_name, u32 sent, u32 rcvd)
+{
+        if (rcvd > sent) {
+                printk(KERN_WARNING "%s: Session INVALID: %s channel %s increased. "
+                        "sent=%u rcvd=%u\n", __func__, chan, attr_name, sent, rcvd);
+                return -EINVAL;
+        }
+
+        return 0;
+}
+/*
+ * Helpers for nfs4_verify_channel_attrs(): compare the attribute that was
+ * sent (args->..) with the value now held in the session (session->..).
+ * Both macros rely on variables named 'args' and 'session' being in scope
+ * at the point of use.
+ */
+#define _verify_fore_channel_attr(_name_) \
+        _verify_channel_attr("fore", #_name_, \
+                             args->fc_attrs._name_, \
+                             session->fc_attrs._name_)
+#define _verify_back_channel_attr(_name_) \
+        _verify_channel_attr("back", #_name_, \
+                             args->bc_attrs._name_, \
+                             session->bc_attrs._name_)
+/*
+ * The server is not allowed to increase the fore channel header pad size,
+ * maximum response size, or maximum number of operations.
+ *
+ * The back channel attributes are only negotiated down: We send what the
+ * (back channel) server insists upon.
+ *
+ * Every attribute is checked, with no short-circuit, so each violation is
+ * logged.  Returns 0 if all checks pass, -EINVAL otherwise.
+ */
+static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
+                                     struct nfs4_session *session)
+{
+        int ret = 0;
+        ret |= _verify_fore_channel_attr(headerpadsz);
+        ret |= _verify_fore_channel_attr(max_resp_sz);
+        ret |= _verify_fore_channel_attr(max_ops);
+        ret |= _verify_back_channel_attr(headerpadsz);
+        ret |= _verify_back_channel_attr(max_rqst_sz);
+        ret |= _verify_back_channel_attr(max_resp_sz);
+        ret |= _verify_back_channel_attr(max_resp_sz_cached);
+        ret |= _verify_back_channel_attr(max_ops);
+        ret |= _verify_back_channel_attr(max_reqs);
+        return ret;
+}
+/*
+ * Send CREATE_SESSION for @clp's session and validate the channel
+ * attributes in the session afterwards.  clp->cl_seqid is advanced only
+ * when both the call and the validation succeed.
+ *
+ * Returns 0 on success, otherwise the RPC status or the validation error.
+ */
+static int _nfs4_proc_create_session(struct nfs_client *clp)
+{
+        struct nfs4_session *session = clp->cl_session;
+        struct nfs41_create_session_args args = {
+                .client = clp,
+                .cb_program = NFS4_CALLBACK,
+        };
+        struct nfs41_create_session_res res = {
+                .client = clp,
+        };
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION],
+                .rpc_argp = &args,
+                .rpc_resp = &res,
+        };
+        int err;
+
+        nfs4_init_channel_attrs(&args);
+        args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
+
+        err = rpc_call_sync(session->clp->cl_rpcclient, &msg, 0);
+        if (err)
+                return err;
+
+        /* Verify the session's negotiated channel_attrs values */
+        err = nfs4_verify_channel_attrs(&args, session);
+        if (err)
+                return err;
+
+        /* Increment the clientid slot sequence id */
+        clp->cl_seqid++;
+        return 0;
+}
+/*
+ * Issues a CREATE_SESSION operation to the server.
+ * It is the responsibility of the caller to verify the session is
+ * expired before calling this routine.
+ *
+ * After a successful CREATE_SESSION the slot tables are either reset
+ * (@reset != 0) or freshly initialised (@reset == 0).  Only in the latter
+ * case is the lease time fetched and state renewal scheduled.
+ *
+ * Returns 0 on success or the first non-zero status encountered.
+ */
+int nfs4_proc_create_session(struct nfs_client *clp, int reset)
+{
+        int status;
+        unsigned *ptr;
+        struct nfs_fsinfo fsinfo;
+        struct nfs4_session *session = clp->cl_session;
+        dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
+        status = _nfs4_proc_create_session(clp);
+        if (status)
+                goto out;
+        /* Init or reset the fore and back channel slot tables */
+        if (reset)
+                status = nfs4_reset_slot_tables(session);
+        else
+                status = nfs4_init_slot_tables(session);
+        dprintk("fore channel slot table initialization returned %d\n", status);
+        if (status)
+                goto out;
+        /* view the session id as four words, for the debug message only */
+        ptr = (unsigned *)&session->sess_id.data[0];
+        dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__,
+                clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]);
+        if (reset)
+                /* Lease time is already set */
+                goto out;
+        /* Get the lease time */
+        status = nfs4_proc_get_lease_time(clp, &fsinfo);
+        if (status == 0) {
+                /* Update lease time and schedule renewal */
+                spin_lock(&clp->cl_lock);
+                clp->cl_lease_time = fsinfo.lease_time * HZ;
+                clp->cl_last_renewal = jiffies;
+                clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+                spin_unlock(&clp->cl_lock);
+                nfs4_schedule_state_renewal(clp);
+        }
+out:
+        dprintk("<-- %s\n", __func__);
+        return status;
+}
+/*
+ * Issue the over-the-wire RPC DESTROY_SESSION (skipped, returning 0, unless
+ * the client is NFS_CS_READY). Callers must serialize access to this routine.
+ */
+int nfs4_proc_destroy_session(struct nfs4_session *session)
+{
+        int status = 0;
+        struct rpc_message msg;
+        dprintk("--> nfs4_proc_destroy_session\n");
+        /* session is still being set up: return 0 without sending the RPC */
+        if (session->clp->cl_cons_state != NFS_CS_READY)
+                return status;
+        msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION];
+        msg.rpc_argp = session;
+        msg.rpc_resp = NULL;
+        msg.rpc_cred = NULL;
+        status = rpc_call_sync(session->clp->cl_rpcclient, &msg, 0);
+        if (status)     /* the error is logged and also returned to the caller */
+                printk(KERN_WARNING
+                        "Got error %d from the server on DESTROY_SESSION. "
+                        "Session has been destroyed regardless...\n", status);
+        dprintk("<-- nfs4_proc_destroy_session\n");
+        return status;
+}
+int nfs4_init_session(struct nfs_server *server)
+{
+        struct nfs_client *clp = server->nfs_client;
+        int ret;
+        if (!nfs4_has_session(clp))
+                return 0;       /* client has no session: nothing to do here */
+        clp->cl_session->fc_attrs.max_rqst_sz = server->wsize;  /* seed the session's fc_attrs size limits ... */
+        clp->cl_session->fc_attrs.max_resp_sz = server->rsize;  /* ... from this server's wsize and rsize */
+        ret = nfs4_recover_expired_lease(server);
+        if (!ret)
+                ret = nfs4_check_client_ready(clp);     /* only consulted when lease recovery returned 0 */
+        return ret;
+}
+/*
+ * Renew the cl_session lease with a SEQUENCE call (nfs4_call_sync_sequence).
+ */
+static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
+{
+        struct nfs4_sequence_args args;
+        struct nfs4_sequence_res res;
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
+                .rpc_argp = &args,
+                .rpc_resp = &res,
+                .rpc_cred = cred,
+        };
+        args.sa_cache_this = 0; /* presumably: do not ask the server to cache this reply - confirm */
+        return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args,
+                                       &res, 0);
+}
+void nfs41_sequence_call_done(struct rpc_task *task, void *data)
+{
+        struct nfs_client *clp = (struct nfs_client *)data;     /* calldata is the nfs_client */
+        nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status);
+        if (task->tk_status < 0) {
+                dprintk("%s ERROR %d\n", __func__, task->tk_status);
+                if (_nfs4_async_handle_error(task, NULL, clp, NULL)
+                                                                == -EAGAIN) {
+                        nfs4_restart_rpc(task, clp);
+                        return;         /* -EAGAIN path: nothing below is released */
+                }
+        }
+        nfs41_sequence_free_slot(clp, task->tk_msg.rpc_resp);
+        dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
+        put_rpccred(task->tk_msg.rpc_cred);
+        kfree(task->tk_msg.rpc_argp);   /* allocated in nfs41_proc_async_sequence() */
+        kfree(task->tk_msg.rpc_resp);   /* likewise allocated in nfs41_proc_async_sequence() */
+        dprintk("<-- %s\n", __func__);
+}
+static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
+{
+        struct nfs_client *clp;
+        struct nfs4_sequence_args *args;
+        struct nfs4_sequence_res *res;
+        clp = (struct nfs_client *)data;        /* calldata is the nfs_client */
+        args = task->tk_msg.rpc_argp;
+        res = task->tk_msg.rpc_resp;
+        if (nfs4_setup_sequence(clp, args, res, 0, task))
+                return;         /* nonzero from nfs4_setup_sequence(): the call is not started here */
+        rpc_call_start(task);
+}
+static const struct rpc_call_ops nfs41_sequence_ops = { /* callbacks passed to rpc_call_async() by nfs41_proc_async_sequence() */
+        .rpc_call_done = nfs41_sequence_call_done,
+        .rpc_call_prepare = nfs41_sequence_prepare,
+};
+static int nfs41_proc_async_sequence(struct nfs_client *clp,
+                                     struct rpc_cred *cred)
+{
+        struct nfs4_sequence_args *args;
+        struct nfs4_sequence_res *res;
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
+                .rpc_cred = cred,
+        };
+        args = kzalloc(sizeof(*args), GFP_KERNEL);      /* heap-allocated: freed later in nfs41_sequence_call_done() */
+        if (!args)
+                return -ENOMEM;
+        res = kzalloc(sizeof(*res), GFP_KERNEL);
+        if (!res) {
+                kfree(args);
+                return -ENOMEM;
+        }
+        res->sr_slotid = NFS4_MAX_SLOT_TABLE;   /* presumably marks "no slot assigned yet" - confirm */
+        msg.rpc_argp = args;
+        msg.rpc_resp = res;
+        return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
+                              &nfs41_sequence_ops, (void *)clp);        /* NOTE(review): nothing here frees args/res if the call never reaches call_done - confirm */
+}
+#endif /* CONFIG_NFS_V4_1 */
+struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {    /* entry 0 of nfs4_reboot_recovery_ops[] */
        .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
        .state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
        .recover_open   = nfs4_open_reclaim,
        .recover_lock   = nfs4_lock_reclaim,
+        .establish_clid = nfs4_init_clientid,
+        .get_clid_cred  = nfs4_get_setclientid_cred,
+};
+#if defined(CONFIG_NFS_V4_1)
+struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {    /* entry 1 of nfs4_reboot_recovery_ops[] */
+        .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
+        .state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
+        .recover_open   = nfs4_open_reclaim,
+        .recover_lock   = nfs4_lock_reclaim,
+        .establish_clid = nfs4_proc_exchange_id,        /* differs from the nfs40 table: EXCHANGE_ID path */
+        .get_clid_cred  = nfs4_get_exchange_id_cred,
+};
+#endif /* CONFIG_NFS_V4_1 */
+struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {   /* entry 0 of nfs4_nograce_recovery_ops[] */
+        .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
+        .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
+        .recover_open   = nfs4_open_expired,
+        .recover_lock   = nfs4_lock_expired,
+        .establish_clid = nfs4_init_clientid,
+        .get_clid_cred  = nfs4_get_setclientid_cred,
 };
-struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops = {
+#if defined(CONFIG_NFS_V4_1)
+struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {   /* entry 1 of nfs4_nograce_recovery_ops[] */
        .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
        .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
        .recover_open   = nfs4_open_expired,
        .recover_lock   = nfs4_lock_expired,
+        .establish_clid = nfs4_proc_exchange_id,        /* differs from the nfs40 table: EXCHANGE_ID path */
+        .get_clid_cred  = nfs4_get_exchange_id_cred,
+};
+#endif /* CONFIG_NFS_V4_1 */
+struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = {   /* entry 0 of nfs4_state_renewal_ops[] */
+        .sched_state_renewal = nfs4_proc_async_renew,
+        .get_state_renewal_cred_locked = nfs4_get_renew_cred_locked,
+        .renew_lease = nfs4_proc_renew,
+};
+#if defined(CONFIG_NFS_V4_1)
+struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {   /* entry 1 of nfs4_state_renewal_ops[] */
+        .sched_state_renewal = nfs41_proc_async_sequence,       /* renewal is done with SEQUENCE calls in this table */
+        .get_state_renewal_cred_locked = nfs4_get_machine_cred_locked,
+        .renew_lease = nfs4_proc_sequence,
+};
+#endif
+/*
+ * Per minor version reboot and network partition recovery ops (index: cl_minorversion)
+ */
+struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[] = {
+        &nfs40_reboot_recovery_ops,
+#if defined(CONFIG_NFS_V4_1)
+        &nfs41_reboot_recovery_ops,
+#endif
+};
+struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = {
+        &nfs40_nograce_recovery_ops,
+#if defined(CONFIG_NFS_V4_1)
+        &nfs41_nograce_recovery_ops,
+#endif
+};
+struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = { /* lease renewal ops, indexed the same way */
+        &nfs40_state_renewal_ops,
+#if defined(CONFIG_NFS_V4_1)
+        &nfs41_state_renewal_ops,
+#endif
 };
 static const struct inode_operations nfs4_file_inode_operations = {
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index f524e932ff7b..e27c6cef18f2 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -59,12 +59,14 @@
 void
 nfs4_renew_state(struct work_struct *work)
 {
+        struct nfs4_state_maintenance_ops *ops;
        struct nfs_client *clp =
                container_of(work, struct nfs_client, cl_renewd.work);
        struct rpc_cred *cred;
        long lease, timeout;
        unsigned long last, now;
+        ops = nfs4_state_renewal_ops[clp->cl_minorversion];
        dprintk("%s: start\n", __func__);
        /* Are there any active superblocks? */
        if (list_empty(&clp->cl_superblocks))
@@ -76,7 +78,7 @@ nfs4_renew_state(struct work_struct *work)
        timeout = (2 * lease) / 3 + (long)last - (long)now;
        /* Are we close to a lease timeout? */
        if (time_after(now, last + lease/3)) {
-                cred = nfs4_get_renew_cred_locked(clp);
+                cred = ops->get_state_renewal_cred_locked(clp);
                spin_unlock(&clp->cl_lock);
                if (cred == NULL) {
                        if (list_empty(&clp->cl_delegations)) {
@@ -86,7 +88,7 @@ nfs4_renew_state(struct work_struct *work)
                        nfs_expire_all_delegations(clp);
                } else {
                        /* Queue an asynchronous RENEW. */
-                        nfs4_proc_async_renew(clp, cred);
+                        ops->sched_state_renewal(clp, cred);
                        put_rpccred(cred);
                }
                timeout = (2 * lease) / 3;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0298e909559f..65ca8c18476f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -60,7 +60,7 @@ const nfs4_stateid zero_stateid;
 static LIST_HEAD(nfs4_clientid_list);
-static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
+int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 {
        unsigned short port;
        int status;
@@ -77,7 +77,7 @@ static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
        return status;
 }
-static struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
+struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
 {
        struct rpc_cred *cred = NULL;
@@ -114,17 +114,21 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
        return cred;
 }
-static struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
+#if defined(CONFIG_NFS_V4_1)
+struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
 {
        struct rpc_cred *cred;
        spin_lock(&clp->cl_lock);
-        cred = nfs4_get_renew_cred_locked(clp);
+        cred = nfs4_get_machine_cred_locked(clp);
        spin_unlock(&clp->cl_lock);
        return cred;
 }
-static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
+#endif /* CONFIG_NFS_V4_1 */
+struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
 {
        struct nfs4_state_owner *sp;
        struct rb_node *pos;
@@ -549,6 +553,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
        INIT_LIST_HEAD(&lsp->ls_sequence.list);
        lsp->ls_seqid.sequence = &lsp->ls_sequence;
        atomic_set(&lsp->ls_count, 1);
+        lsp->ls_state = state;
        lsp->ls_owner = fl_owner;
        spin_lock(&clp->cl_lock);
        nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
@@ -583,7 +588,6 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
                if (lsp != NULL)
                        break;
                if (new != NULL) {
-                        new->ls_state = state;
                        list_add(&new->ls_locks, &state->lock_states);
                        set_bit(LK_STATE_IN_USE, &state->flags);
                        lsp = new;
@@ -738,12 +742,14 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
 void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
 {
-        if (status == -NFS4ERR_BAD_SEQID) {
+        struct nfs4_state_owner *sp = container_of(seqid->sequence,
-                struct nfs4_state_owner *sp = container_of(seqid->sequence,
+                                        struct nfs4_state_owner, so_seqid);
-                                struct nfs4_state_owner, so_seqid);
+        struct nfs_server *server = sp->so_server;
+        if (status == -NFS4ERR_BAD_SEQID)
                nfs4_drop_state_owner(sp);
-        }
+        if (!nfs4_has_session(server->nfs_client))
-        nfs_increment_seqid(status, seqid);
+                nfs_increment_seqid(status, seqid);
 }
 /*
@@ -847,32 +853,45 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
        struct file_lock *fl;
        int status = 0;
+        if (inode->i_flock == NULL)
+                return 0;
+        /* Guard against delegation returns and new lock/unlock calls */
        down_write(&nfsi->rwsem);
+        /* Protect inode->i_flock using the BKL */
+        lock_kernel();
        for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
                if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
                        continue;
                if (nfs_file_open_context(fl->fl_file)->state != state)
                        continue;
+                unlock_kernel();
                status = ops->recover_lock(state, fl);
-                if (status >= 0)
-                        continue;
                switch (status) {
+                        case 0:
+                                break;
+                        case -ESTALE:
+                        case -NFS4ERR_ADMIN_REVOKED:
+                        case -NFS4ERR_STALE_STATEID:
+                        case -NFS4ERR_BAD_STATEID:
+                        case -NFS4ERR_EXPIRED:
+                        case -NFS4ERR_NO_GRACE:
+                        case -NFS4ERR_STALE_CLIENTID:
+                                goto out;
                        default:
                                printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
                                                __func__, status);
-                        case -NFS4ERR_EXPIRED:
+                        case -ENOMEM:
-                        case -NFS4ERR_NO_GRACE:
+                        case -NFS4ERR_DENIED:
                        case -NFS4ERR_RECLAIM_BAD:
                        case -NFS4ERR_RECLAIM_CONFLICT:
                                /* kill_proc(fl->fl_pid, SIGLOST, 1); */
-                                break;
+                                status = 0;
-                        case -NFS4ERR_STALE_CLIENTID:
-                                goto out_err;
                }
+                lock_kernel();
        }
-        up_write(&nfsi->rwsem);
+        unlock_kernel();
-        return 0;
+out:
-out_err:
        up_write(&nfsi->rwsem);
        return status;
 }
@@ -918,6 +937,7 @@ restart:
                                printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
                                                __func__, status);
                        case -ENOENT:
+                        case -ENOMEM:
                        case -ESTALE:
                                /*
                                 * Open state on this file cannot be recovered
@@ -928,6 +948,9 @@ restart:
                                /* Mark the file as being 'closed' */
                                state->state = 0;
                                break;
+                        case -NFS4ERR_ADMIN_REVOKED:
+                        case -NFS4ERR_STALE_STATEID:
+                        case -NFS4ERR_BAD_STATEID:
                        case -NFS4ERR_RECLAIM_BAD:
                        case -NFS4ERR_RECLAIM_CONFLICT:
                                nfs4_state_mark_reclaim_nograce(sp->so_client, state);
@@ -1042,6 +1065,14 @@ static void nfs4_recovery_handle_error(struct nfs_client *clp, int error)
                case -NFS4ERR_EXPIRED:
                        set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
                        nfs4_state_start_reclaim_nograce(clp);
+                case -NFS4ERR_BADSESSION:
+                case -NFS4ERR_BADSLOT:
+                case -NFS4ERR_BAD_HIGH_SLOT:
+                case -NFS4ERR_DEADSESSION:
+                case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+                case -NFS4ERR_SEQ_FALSE_RETRY:
+                case -NFS4ERR_SEQ_MISORDERED:
+                        set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
        }
 }
@@ -1075,18 +1106,22 @@ restart:
 static int nfs4_check_lease(struct nfs_client *clp)
 {
        struct rpc_cred *cred;
+        struct nfs4_state_maintenance_ops *ops =
+                nfs4_state_renewal_ops[clp->cl_minorversion];
        int status = -NFS4ERR_EXPIRED;
        /* Is the client already known to have an expired lease? */
        if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
                return 0;
-        cred = nfs4_get_renew_cred(clp);
+        spin_lock(&clp->cl_lock);
+        cred = ops->get_state_renewal_cred_locked(clp);
+        spin_unlock(&clp->cl_lock);
        if (cred == NULL) {
                cred = nfs4_get_setclientid_cred(clp);
                if (cred == NULL)
                        goto out;
        }
-        status = nfs4_proc_renew(clp, cred);
+        status = ops->renew_lease(clp, cred);
        put_rpccred(cred);
 out:
        nfs4_recovery_handle_error(clp, status);
@@ -1096,21 +1131,98 @@ out:
 static int nfs4_reclaim_lease(struct nfs_client *clp)
 {
        struct rpc_cred *cred;
+        struct nfs4_state_recovery_ops *ops =
+                nfs4_reboot_recovery_ops[clp->cl_minorversion];
        int status = -ENOENT;
-        cred = nfs4_get_setclientid_cred(clp);
+        cred = ops->get_clid_cred(clp);
        if (cred != NULL) {
-                status = nfs4_init_client(clp, cred);
+                status = ops->establish_clid(clp, cred);
                put_rpccred(cred);
                /* Handle case where the user hasn't set up machine creds */
                if (status == -EACCES && cred == clp->cl_machine_cred) {
                        nfs4_clear_machine_cred(clp);
                        status = -EAGAIN;
                }
+                if (status == -NFS4ERR_MINOR_VERS_MISMATCH)
+                        status = -EPROTONOSUPPORT;
+        }
+        return status;
+}
+#ifdef CONFIG_NFS_V4_1
+static void nfs4_session_recovery_handle_error(struct nfs_client *clp, int err)
+{
+        switch (err) {
+        case -NFS4ERR_STALE_CLIENTID:   /* flag the lease as expired and request session setup */
+                set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+                set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
+        }       /* every other error value changes no state bits here */
+}
+static int nfs4_reset_session(struct nfs_client *clp)
+{
+        int status;
+        status = nfs4_proc_destroy_session(clp->cl_session);
+        if (status && status != -NFS4ERR_BADSESSION &&
+            status != -NFS4ERR_DEADSESSION) {   /* BADSESSION/DEADSESSION from the destroy step are tolerated */
+                nfs4_session_recovery_handle_error(clp, status);
+                goto out;
        }
+        memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN);       /* clear the old session id before creating a new session */
+        status = nfs4_proc_create_session(clp, 1);      /* 1 == reset: slot tables are reset rather than initialized */
+        if (status)
+                nfs4_session_recovery_handle_error(clp, status);
+                /* success or failure, continue to the wake-up below */
+out:
+        /* Wake up the next rpc task even on error */
+        rpc_wake_up_next(&clp->cl_session->fc_slot_table.slot_tbl_waitq);
        return status;
 }
+static int nfs4_initialize_session(struct nfs_client *clp)
+{
+        int status;
+        status = nfs4_proc_create_session(clp, 0);      /* 0 == not a reset: slot tables are initialized */
+        if (!status) {
+                nfs_mark_client_ready(clp, NFS_CS_READY);
+        } else if (status == -NFS4ERR_STALE_CLIENTID) { /* same two bits as nfs4_session_recovery_handle_error() sets */
+                set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+                set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
+        } else {
+                nfs_mark_client_ready(clp, status);     /* any other error is passed to nfs_mark_client_ready() */
+        }
+        return status;
+}
+#else /* CONFIG_NFS_V4_1 */
+static int nfs4_reset_session(struct nfs_client *clp) { return 0; }     /* stub when CONFIG_NFS_V4_1 is not set */
+static int nfs4_initialize_session(struct nfs_client *clp) { return 0; }        /* stub when CONFIG_NFS_V4_1 is not set */
+#endif /* CONFIG_NFS_V4_1 */
+/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and, for v4.1, only for the
+ * recoverable EXCHANGE_ID errors (NFS4ERR_DELAY, NFS4ERR_CLID_INUSE, EAGAIN).
+ */
+static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
+{
+        if (nfs4_has_session(clp)) {
+                switch (status) {
+                case -NFS4ERR_DELAY:
+                case -NFS4ERR_CLID_INUSE:
+                case -EAGAIN:
+                        break;  /* leave the switch and set the bit below */
+                case -NFS4ERR_NOT_SAME: /* FIXME: implement recovery
+                                         * in nfs4_exchange_id */
+                default:
+                        return; /* bit is left untouched for every other status */
+                }
+        }
+        set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+}
 static void nfs4_state_manager(struct nfs_client *clp)
 {
        int status = 0;
@@ -1121,9 +1233,12 @@ static void nfs4_state_manager(struct nfs_client *clp)
                        /* We're going to have to re-establish a clientid */
                        status = nfs4_reclaim_lease(clp);
                        if (status) {
-                                set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+                                nfs4_set_lease_expired(clp, status);
                                if (status == -EAGAIN)
                                        continue;
+                                if (clp->cl_cons_state ==
+                                                        NFS_CS_SESSION_INITING)
+                                        nfs_mark_client_ready(clp, status);
                                goto out_error;
                        }
                        clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
@@ -1134,25 +1249,44 @@ static void nfs4_state_manager(struct nfs_client *clp)
                        if (status != 0)
                                continue;
                }
+                /* Initialize or reset the session */
+                if (nfs4_has_session(clp) &&
+                   test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) {
+                        if (clp->cl_cons_state == NFS_CS_SESSION_INITING)
+                                status = nfs4_initialize_session(clp);
+                        else
+                                status = nfs4_reset_session(clp);
+                        if (status) {
+                                if (status == -NFS4ERR_STALE_CLIENTID)
+                                        continue;
+                                goto out_error;
+                        }
+                }
                /* First recover reboot state... */
                if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
-                        status = nfs4_do_reclaim(clp, &nfs4_reboot_recovery_ops);
+                        status = nfs4_do_reclaim(clp,
+                                nfs4_reboot_recovery_ops[clp->cl_minorversion]);
                        if (status == -NFS4ERR_STALE_CLIENTID)
                                continue;
+                        if (test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
+                                continue;
                        nfs4_state_end_reclaim_reboot(clp);
                        continue;
                }
                /* Now recover expired state... */
                if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
-                        status = nfs4_do_reclaim(clp, &nfs4_nograce_recovery_ops);
+                        status = nfs4_do_reclaim(clp,
+                                nfs4_nograce_recovery_ops[clp->cl_minorversion]);
                        if (status < 0) {
                                set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
                                if (status == -NFS4ERR_STALE_CLIENTID)
                                        continue;
                                if (status == -NFS4ERR_EXPIRED)
                                        continue;
+                                if (test_bit(NFS4CLNT_SESSION_SETUP,
+                                                                &clp->cl_state))
+                                        continue;
                                goto out_error;
                        } else
                                nfs4_state_end_reclaim_nograce(clp);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1690f0e44b91..617273e7d47f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -192,12 +192,16 @@ static int nfs4_stat_to_errno(int);
                                 decode_verifier_maxsz)
 #define encode_remove_maxsz     (op_encode_hdr_maxsz + \
                                nfs4_name_maxsz)
+#define decode_remove_maxsz     (op_decode_hdr_maxsz + \
+                                 decode_change_info_maxsz)
 #define encode_rename_maxsz     (op_encode_hdr_maxsz + \
                                2 * nfs4_name_maxsz)
-#define decode_rename_maxsz     (op_decode_hdr_maxsz + 5 + 5)
+#define decode_rename_maxsz     (op_decode_hdr_maxsz + \
+                                 decode_change_info_maxsz + \
+                                 decode_change_info_maxsz)
 #define encode_link_maxsz       (op_encode_hdr_maxsz + \
                                nfs4_name_maxsz)
-#define decode_link_maxsz       (op_decode_hdr_maxsz + 5)
+#define decode_link_maxsz       (op_decode_hdr_maxsz + decode_change_info_maxsz)
 #define encode_lock_maxsz       (op_encode_hdr_maxsz + \
                                 7 + \
                                 1 + encode_stateid_maxsz + 8)
@@ -240,43 +244,115 @@ static int nfs4_stat_to_errno(int);
                                (encode_getattr_maxsz)
 #define decode_fs_locations_maxsz \
                                (0)
+#if defined(CONFIG_NFS_V4_1)
+#define NFS4_MAX_MACHINE_NAME_LEN (64)
+#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \
+                                encode_verifier_maxsz + \
+                                1 /* co_ownerid.len */ + \
+                                XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \
+                                1 /* flags */ + \
+                                1 /* spa_how */ + \
+                                0 /* SP4_NONE (for now) */ + \
+                                1 /* zero implementation id array */)
+#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \
+                                2 /* eir_clientid */ + \
+                                1 /* eir_sequenceid */ + \
+                                1 /* eir_flags */ + \
+                                1 /* spr_how */ + \
+                                0 /* SP4_NONE (for now) */ + \
+                                2 /* eir_server_owner.so_minor_id */ + \
+                                /* eir_server_owner.so_major_id<> */ \
+                                XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
+                                /* eir_server_scope<> */ \
+                                XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
+                                1 /* eir_server_impl_id array length */ + \
+                                0 /* ignored eir_server_impl_id contents */)
+#define encode_channel_attrs_maxsz  (6 + 1 /* ca_rdma_ird.len (0) */)
+#define decode_channel_attrs_maxsz  (6 + \
+                                     1 /* ca_rdma_ird.len */ + \
+                                     1 /* ca_rdma_ird */)
+#define encode_create_session_maxsz  (op_encode_hdr_maxsz + \
+                                     2 /* csa_clientid */ + \
+                                     1 /* csa_sequence */ + \
+                                     1 /* csa_flags */ + \
+                                     encode_channel_attrs_maxsz + \
+                                     encode_channel_attrs_maxsz + \
+                                     1 /* csa_cb_program */ + \
+                                     1 /* csa_sec_parms.len (1) */ + \
+                                     1 /* cb_secflavor (AUTH_SYS) */ + \
+                                     1 /* stamp */ + \
+                                     1 /* machinename.len */ + \
+                                     XDR_QUADLEN(NFS4_MAX_MACHINE_NAME_LEN) + \
+                                     1 /* uid */ + \
+                                     1 /* gid */ + \
+                                     1 /* gids.len (0) */)
+#define decode_create_session_maxsz  (op_decode_hdr_maxsz +     \
+                                     XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+                                     1 /* csr_sequence */ + \
+                                     1 /* csr_flags */ + \
+                                     decode_channel_attrs_maxsz + \
+                                     decode_channel_attrs_maxsz)
+#define encode_destroy_session_maxsz    (op_encode_hdr_maxsz + 4)
+#define decode_destroy_session_maxsz    (op_decode_hdr_maxsz)
+#define encode_sequence_maxsz   (op_encode_hdr_maxsz + \
+                                XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4)
+#define decode_sequence_maxsz   (op_decode_hdr_maxsz + \
+                                XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
+#else /* CONFIG_NFS_V4_1 */
+#define encode_sequence_maxsz   0
+#define decode_sequence_maxsz   0
+#endif /* CONFIG_NFS_V4_1 */
 #define NFS4_enc_compound_sz    (1024)  /* XXX: large enough? */
 #define NFS4_dec_compound_sz    (1024)  /* XXX: large enough? */
 #define NFS4_enc_read_sz        (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_read_maxsz)
 #define NFS4_dec_read_sz        (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_read_maxsz)
 #define NFS4_enc_readlink_sz    (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_readlink_maxsz)
 #define NFS4_dec_readlink_sz    (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_readlink_maxsz)
 #define NFS4_enc_readdir_sz     (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_readdir_maxsz)
 #define NFS4_dec_readdir_sz     (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_readdir_maxsz)
 #define NFS4_enc_write_sz       (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_write_maxsz + \
                                encode_getattr_maxsz)
 #define NFS4_dec_write_sz       (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_write_maxsz + \
                                decode_getattr_maxsz)
 #define NFS4_enc_commit_sz      (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_commit_maxsz + \
                                encode_getattr_maxsz)
 #define NFS4_dec_commit_sz      (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_commit_maxsz + \
                                decode_getattr_maxsz)
 #define NFS4_enc_open_sz        (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_savefh_maxsz + \
                                encode_open_maxsz + \
@@ -285,6 +361,7 @@ static int nfs4_stat_to_errno(int);
                                encode_restorefh_maxsz + \
                                encode_getattr_maxsz)
 #define NFS4_dec_open_sz        (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_savefh_maxsz + \
                                decode_open_maxsz + \
@@ -301,43 +378,53 @@ static int nfs4_stat_to_errno(int);
                                 decode_putfh_maxsz + \
                                 decode_open_confirm_maxsz)
 #define NFS4_enc_open_noattr_sz (compound_encode_hdr_maxsz + \
+                                        encode_sequence_maxsz + \
                                        encode_putfh_maxsz + \
                                        encode_open_maxsz + \
                                        encode_getattr_maxsz)
 #define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \
+                                        decode_sequence_maxsz + \
                                        decode_putfh_maxsz + \
                                        decode_open_maxsz + \
                                        decode_getattr_maxsz)
 #define NFS4_enc_open_downgrade_sz \
                                (compound_encode_hdr_maxsz + \
+                                 encode_sequence_maxsz + \
                                 encode_putfh_maxsz + \
                                 encode_open_downgrade_maxsz + \
                                 encode_getattr_maxsz)
 #define NFS4_dec_open_downgrade_sz \
                                (compound_decode_hdr_maxsz + \
+                                 decode_sequence_maxsz + \
                                 decode_putfh_maxsz + \
                                 decode_open_downgrade_maxsz + \
                                 decode_getattr_maxsz)
 #define NFS4_enc_close_sz       (compound_encode_hdr_maxsz + \
+                                 encode_sequence_maxsz + \
                                 encode_putfh_maxsz + \
                                 encode_close_maxsz + \
                                 encode_getattr_maxsz)
 #define NFS4_dec_close_sz       (compound_decode_hdr_maxsz + \
+                                 decode_sequence_maxsz + \
                                 decode_putfh_maxsz + \
                                 decode_close_maxsz + \
                                 decode_getattr_maxsz)
 #define NFS4_enc_setattr_sz     (compound_encode_hdr_maxsz + \
+                                 encode_sequence_maxsz + \
                                 encode_putfh_maxsz + \
                                 encode_setattr_maxsz + \
                                 encode_getattr_maxsz)
 #define NFS4_dec_setattr_sz     (compound_decode_hdr_maxsz + \
+                                 decode_sequence_maxsz + \
                                 decode_putfh_maxsz + \
                                 decode_setattr_maxsz + \
                                 decode_getattr_maxsz)
 #define NFS4_enc_fsinfo_sz      (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_fsinfo_maxsz)
 #define NFS4_dec_fsinfo_sz      (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_fsinfo_maxsz)
 #define NFS4_enc_renew_sz       (compound_encode_hdr_maxsz + \
@@ -359,64 +446,81 @@ static int nfs4_stat_to_errno(int);
                                decode_putrootfh_maxsz + \
                                decode_fsinfo_maxsz)
 #define NFS4_enc_lock_sz        (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_lock_maxsz)
 #define NFS4_dec_lock_sz        (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_lock_maxsz)
 #define NFS4_enc_lockt_sz       (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_lockt_maxsz)
 #define NFS4_dec_lockt_sz       (compound_decode_hdr_maxsz + \
+                                 decode_sequence_maxsz + \
                                 decode_putfh_maxsz + \
                                 decode_lockt_maxsz)
 #define NFS4_enc_locku_sz       (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_locku_maxsz)
 #define NFS4_dec_locku_sz       (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_locku_maxsz)
 #define NFS4_enc_access_sz      (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_access_maxsz + \
                                encode_getattr_maxsz)
 #define NFS4_dec_access_sz      (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_access_maxsz + \
                                decode_getattr_maxsz)
 #define NFS4_enc_getattr_sz     (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_getattr_maxsz)
 #define NFS4_dec_getattr_sz     (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_getattr_maxsz)
 #define NFS4_enc_lookup_sz      (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_lookup_maxsz + \
                                encode_getattr_maxsz + \
                                encode_getfh_maxsz)
 #define NFS4_dec_lookup_sz      (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_lookup_maxsz + \
                                decode_getattr_maxsz + \
                                decode_getfh_maxsz)
 #define NFS4_enc_lookup_root_sz (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putrootfh_maxsz + \
                                encode_getattr_maxsz + \
                                encode_getfh_maxsz)
 #define NFS4_dec_lookup_root_sz (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putrootfh_maxsz + \
                                decode_getattr_maxsz + \
                                decode_getfh_maxsz)
 #define NFS4_enc_remove_sz      (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_remove_maxsz + \
                                encode_getattr_maxsz)
 #define NFS4_dec_remove_sz      (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
-                                op_decode_hdr_maxsz + 5 + \
+                                decode_remove_maxsz + \
                                decode_getattr_maxsz)
 #define NFS4_enc_rename_sz      (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_savefh_maxsz + \
                                encode_putfh_maxsz + \
@@ -425,6 +529,7 @@ static int nfs4_stat_to_errno(int);
                                encode_restorefh_maxsz + \
                                encode_getattr_maxsz)
 #define NFS4_dec_rename_sz      (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_savefh_maxsz + \
                                decode_putfh_maxsz + \
@@ -433,6 +538,7 @@ static int nfs4_stat_to_errno(int);
                                decode_restorefh_maxsz + \
                                decode_getattr_maxsz)
 #define NFS4_enc_link_sz        (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_savefh_maxsz + \
                                encode_putfh_maxsz + \
@@ -441,6 +547,7 @@ static int nfs4_stat_to_errno(int);
                                encode_restorefh_maxsz + \
                                decode_getattr_maxsz)
 #define NFS4_dec_link_sz        (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_savefh_maxsz + \
                                decode_putfh_maxsz + \
@@ -449,16 +556,19 @@ static int nfs4_stat_to_errno(int);
                                decode_restorefh_maxsz + \
                                decode_getattr_maxsz)
 #define NFS4_enc_symlink_sz     (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_symlink_maxsz + \
                                encode_getattr_maxsz + \
                                encode_getfh_maxsz)
 #define NFS4_dec_symlink_sz     (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_symlink_maxsz + \
                                decode_getattr_maxsz + \
                                decode_getfh_maxsz)
 #define NFS4_enc_create_sz      (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_savefh_maxsz + \
                                encode_create_maxsz + \
@@ -467,6 +577,7 @@ static int nfs4_stat_to_errno(int);
                                encode_restorefh_maxsz + \
                                encode_getattr_maxsz)
 #define NFS4_dec_create_sz      (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_savefh_maxsz + \
                                decode_create_maxsz + \
@@ -475,52 +586,98 @@ static int nfs4_stat_to_errno(int);
                                decode_restorefh_maxsz + \
                                decode_getattr_maxsz)
 #define NFS4_enc_pathconf_sz    (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_getattr_maxsz)
 #define NFS4_dec_pathconf_sz    (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_getattr_maxsz)
 #define NFS4_enc_statfs_sz      (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_statfs_maxsz)
 #define NFS4_dec_statfs_sz      (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_statfs_maxsz)
 #define NFS4_enc_server_caps_sz (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_getattr_maxsz)
 #define NFS4_dec_server_caps_sz (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_getattr_maxsz)
 #define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_delegreturn_maxsz + \
                                encode_getattr_maxsz)
 #define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_delegreturn_maxsz + \
                                decode_getattr_maxsz)
 #define NFS4_enc_getacl_sz      (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_getacl_maxsz)
 #define NFS4_dec_getacl_sz      (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_getacl_maxsz)
 #define NFS4_enc_setacl_sz      (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
                                encode_setacl_maxsz)
 #define NFS4_dec_setacl_sz      (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_setacl_maxsz)
 #define NFS4_enc_fs_locations_sz \
                                (compound_encode_hdr_maxsz + \
+                                 encode_sequence_maxsz + \
                                 encode_putfh_maxsz + \
                                 encode_lookup_maxsz + \
                                 encode_fs_locations_maxsz)
 #define NFS4_dec_fs_locations_sz \
                                (compound_decode_hdr_maxsz + \
+                                 decode_sequence_maxsz + \
                                 decode_putfh_maxsz + \
                                 decode_lookup_maxsz + \
                                 decode_fs_locations_maxsz)
+#if defined(CONFIG_NFS_V4_1)
+#define NFS4_enc_exchange_id_sz \
+                                (compound_encode_hdr_maxsz + \
+                                 encode_exchange_id_maxsz)
+#define NFS4_dec_exchange_id_sz \
+                                (compound_decode_hdr_maxsz + \
+                                 decode_exchange_id_maxsz)
+#define NFS4_enc_create_session_sz \
+                                (compound_encode_hdr_maxsz + \
+                                 encode_create_session_maxsz)
+#define NFS4_dec_create_session_sz \
+                                (compound_decode_hdr_maxsz + \
+                                 decode_create_session_maxsz)
+#define NFS4_enc_destroy_session_sz     (compound_encode_hdr_maxsz + \
+                                         encode_destroy_session_maxsz)
+#define NFS4_dec_destroy_session_sz     (compound_decode_hdr_maxsz + \
+                                         decode_destroy_session_maxsz)
+#define NFS4_enc_sequence_sz \
+                                (compound_encode_hdr_maxsz + \
+                                 encode_sequence_maxsz)
+#define NFS4_dec_sequence_sz \
+                                (compound_decode_hdr_maxsz + \
+                                 decode_sequence_maxsz)
+#define NFS4_enc_get_lease_time_sz      (compound_encode_hdr_maxsz + \
+                                         encode_sequence_maxsz + \
+                                         encode_putrootfh_maxsz + \
+                                         encode_fsinfo_maxsz)
+#define NFS4_dec_get_lease_time_sz      (compound_decode_hdr_maxsz + \
+                                         decode_sequence_maxsz + \
+                                         decode_putrootfh_maxsz + \
+                                         decode_fsinfo_maxsz)
+#endif /* CONFIG_NFS_V4_1 */
 static const umode_t nfs_type2fmt[] = {
        [NF4BAD] = 0,
@@ -541,6 +698,8 @@ struct compound_hdr {
        __be32 *        nops_p;
        uint32_t        taglen;
        char *          tag;
+        uint32_t        replen;         /* expected reply words */
+        u32             minorversion;
 };
 /*
@@ -576,22 +735,31 @@ static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *
        xdr_encode_opaque(p, str, len);
 }
-static void encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
+static void encode_compound_hdr(struct xdr_stream *xdr,
+                                struct rpc_rqst *req,
+                                struct compound_hdr *hdr)
 {
        __be32 *p;
+        struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+        /* initialize running count of expected words in reply.
+         * NOTE: the replied tag SHOULD be the same as the one sent,
+         * but this is not required as a MUST for the server to do so. */
+        hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen;
        dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
        BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
        RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2));
        WRITE32(hdr->taglen);
        WRITEMEM(hdr->tag, hdr->taglen);
-        WRITE32(NFS4_MINOR_VERSION);
+        WRITE32(hdr->minorversion);
        hdr->nops_p = p;
        WRITE32(hdr->nops);
 }
 static void encode_nops(struct compound_hdr *hdr)
 {
+        BUG_ON(hdr->nops > NFS4_MAX_OPS);
        *hdr->nops_p = htonl(hdr->nops);
 }
@@ -736,6 +904,7 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd
        WRITE32(OP_ACCESS);
        WRITE32(access);
        hdr->nops++;
+        hdr->replen += decode_access_maxsz;
 }
 static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
@@ -747,6 +916,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
        WRITE32(arg->seqid->sequence->counter);
        WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
        hdr->nops++;
+        hdr->replen += decode_close_maxsz;
 }
 static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
@@ -758,6 +928,7 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
        WRITE64(args->offset);
        WRITE32(args->count);
        hdr->nops++;
+        hdr->replen += decode_commit_maxsz;
 }
 static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr)
@@ -789,6 +960,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
        WRITE32(create->name->len);
        WRITEMEM(create->name->name, create->name->len);
        hdr->nops++;
+        hdr->replen += decode_create_maxsz;
        encode_attrs(xdr, create->attrs, create->server);
 }
@@ -802,6 +974,7 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
        WRITE32(1);
        WRITE32(bitmap);
        hdr->nops++;
+        hdr->replen += decode_getattr_maxsz;
 }
 static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr)
@@ -814,6 +987,7 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
        WRITE32(bm0);
        WRITE32(bm1);
        hdr->nops++;
+        hdr->replen += decode_getattr_maxsz;
 }
 static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
@@ -841,6 +1015,7 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
        RESERVE_SPACE(4);
        WRITE32(OP_GETFH);
        hdr->nops++;
+        hdr->replen += decode_getfh_maxsz;
 }
 static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
@@ -852,6 +1027,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
        WRITE32(name->len);
        WRITEMEM(name->name, name->len);
        hdr->nops++;
+        hdr->replen += decode_link_maxsz;
 }
 static inline int nfs4_lock_type(struct file_lock *fl, int block)
@@ -899,6 +1075,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
                WRITE32(args->lock_seqid->sequence->counter);
        }
        hdr->nops++;
+        hdr->replen += decode_lock_maxsz;
 }
 static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr)
@@ -915,6 +1092,7 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
        WRITEMEM("lock id:", 8);
        WRITE64(args->lock_owner.id);
        hdr->nops++;
+        hdr->replen += decode_lockt_maxsz;
 }
 static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr)
@@ -929,6 +1107,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
        WRITE64(args->fl->fl_start);
        WRITE64(nfs4_lock_length(args->fl));
        hdr->nops++;
+        hdr->replen += decode_locku_maxsz;
 }
 static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
@@ -941,6 +1120,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
        WRITE32(len);
        WRITEMEM(name->name, len);
        hdr->nops++;
+        hdr->replen += decode_lookup_maxsz;
 }
 static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
@@ -1080,6 +1260,7 @@ static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg,
                BUG();
        }
        hdr->nops++;
+        hdr->replen += decode_open_maxsz;
 }
 static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr)
@@ -1091,6 +1272,7 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
        WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
        WRITE32(arg->seqid->sequence->counter);
        hdr->nops++;
+        hdr->replen += decode_open_confirm_maxsz;
 }
 static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
@@ -1103,6 +1285,7 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
        WRITE32(arg->seqid->sequence->counter);
        encode_share_access(xdr, arg->fmode);
        hdr->nops++;
+        hdr->replen += decode_open_downgrade_maxsz;
 }
 static void
@@ -1116,6 +1299,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
        WRITE32(len);
        WRITEMEM(fh->data, len);
        hdr->nops++;
+        hdr->replen += decode_putfh_maxsz;
 }
 static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
@@ -1125,6 +1309,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
        RESERVE_SPACE(4);
        WRITE32(OP_PUTROOTFH);
        hdr->nops++;
+        hdr->replen += decode_putrootfh_maxsz;
 }
 static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
@@ -1153,6 +1338,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
        WRITE64(args->offset);
        WRITE32(args->count);
        hdr->nops++;
+        hdr->replen += decode_read_maxsz;
 }
 static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
@@ -1178,6 +1364,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
        WRITE32(attrs[0] & readdir->bitmask[0]);
        WRITE32(attrs[1] & readdir->bitmask[1]);
        hdr->nops++;
+        hdr->replen += decode_readdir_maxsz;
        dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
                        __func__,
                        (unsigned long long)readdir->cookie,
@@ -1194,6 +1381,7 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *
        RESERVE_SPACE(4);
        WRITE32(OP_READLINK);
        hdr->nops++;
+        hdr->replen += decode_readlink_maxsz;
 }
 static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
@@ -1205,6 +1393,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
        WRITE32(name->len);
        WRITEMEM(name->name, name->len);
        hdr->nops++;
+        hdr->replen += decode_remove_maxsz;
 }
 static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr)
@@ -1220,6 +1409,7 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
        WRITE32(newname->len);
        WRITEMEM(newname->name, newname->len);
        hdr->nops++;
+        hdr->replen += decode_rename_maxsz;
 }
 static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr)
@@ -1230,6 +1420,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
        WRITE32(OP_RENEW);
        WRITE64(client_stateid->cl_clientid);
        hdr->nops++;
+        hdr->replen += decode_renew_maxsz;
 }
 static void
@@ -1240,6 +1431,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
        RESERVE_SPACE(4);
        WRITE32(OP_RESTOREFH);
        hdr->nops++;
+        hdr->replen += decode_restorefh_maxsz;
 }
 static int
@@ -1259,6 +1451,7 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
        WRITE32(arg->acl_len);
        xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
        hdr->nops++;
+        hdr->replen += decode_setacl_maxsz;
        return 0;
 }
@@ -1270,6 +1463,7 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
        RESERVE_SPACE(4);
        WRITE32(OP_SAVEFH);
        hdr->nops++;
+        hdr->replen += decode_savefh_maxsz;
 }
 static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr)
@@ -1280,6 +1474,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
        WRITE32(OP_SETATTR);
        WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
        hdr->nops++;
+        hdr->replen += decode_setattr_maxsz;
        encode_attrs(xdr, arg->iap, server);
 }
@@ -1299,6 +1494,7 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
        RESERVE_SPACE(4);
        WRITE32(setclientid->sc_cb_ident);
        hdr->nops++;
+        hdr->replen += decode_setclientid_maxsz;
 }
 static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr)
@@ -1310,6 +1506,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
        WRITE64(client_state->cl_clientid);
        WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
        hdr->nops++;
+        hdr->replen += decode_setclientid_confirm_maxsz;
 }
 static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
@@ -1328,6 +1525,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
        xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
        hdr->nops++;
+        hdr->replen += decode_write_maxsz;
 }
 static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr)
@@ -1339,11 +1537,163 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
        WRITE32(OP_DELEGRETURN);
        WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
        hdr->nops++;
+        hdr->replen += decode_delegreturn_maxsz;
+}
+#if defined(CONFIG_NFS_V4_1)
+/* NFSv4.1 operations */
+/*
+ * Append an EXCHANGE_ID operation to the compound being built in @xdr.
+ *
+ * Emits, in order: the OP_EXCHANGE_ID opcode, the verifier bytes from
+ * args->verifier, the client id string (args->id, args->id_len), the
+ * flags word and two zero words. On return hdr->nops has been bumped and
+ * decode_exchange_id_maxsz added to hdr->replen.
+ */
+static void encode_exchange_id(struct xdr_stream *xdr,
+                               struct nfs41_exchange_id_args *args,
+                               struct compound_hdr *hdr)
+{
+        __be32 *p;      /* not referenced directly; presumably the cursor used by RESERVE_SPACE/WRITE* */
+        /* opcode word plus the fixed-size verifier */
+        RESERVE_SPACE(4 + sizeof(args->verifier->data));
+        WRITE32(OP_EXCHANGE_ID);
+        WRITEMEM(args->verifier->data, sizeof(args->verifier->data));
+        encode_string(xdr, args->id_len, args->id);
+        /* three trailing 32-bit words: flags and two empty fields */
+        RESERVE_SPACE(12);
+        WRITE32(args->flags);
+        WRITE32(0);     /* zero length state_protect4_a */
+        WRITE32(0);     /* zero length implementation id array */
+        /* account for this op in the compound header and reply-size estimate */
+        hdr->nops++;
+        hdr->replen += decode_exchange_id_maxsz;
+}
+/*
+ * Append a CREATE_SESSION operation to the compound being built in @xdr.
+ *
+ * Emits, in order: the opcode, the client id (clp->cl_ex_clid), the
+ * sequence id and flags, one block of seven words for the fore channel
+ * and one for the back channel, the callback program number, and a
+ * single AUTH_UNIX callback security entry (stamp, machine name taken
+ * from clp->cl_ipaddr, uid 0, gid 0, no supplementary gids).
+ * On return hdr->nops has been bumped and decode_create_session_maxsz
+ * added to hdr->replen.
+ *
+ * The back channel header padding is taken from args->bc_attrs like
+ * every other back channel field, rather than from args->fc_attrs.
+ */
+static void encode_create_session(struct xdr_stream *xdr,
+                                  struct nfs41_create_session_args *args,
+                                  struct compound_hdr *hdr)
+{
+        __be32 *p;      /* not referenced directly; presumably the cursor used by RESERVE_SPACE/WRITE* */
+        char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
+        uint32_t len;
+        struct nfs_client *clp = args->client;
+        RESERVE_SPACE(4);
+        WRITE32(OP_CREATE_SESSION);
+        RESERVE_SPACE(8);
+        WRITE64(clp->cl_ex_clid);
+        RESERVE_SPACE(8);
+        WRITE32(clp->cl_seqid);                 /* sequence id */
+        WRITE32(args->flags);                   /* flags */
+        RESERVE_SPACE(2*28);                    /* 2 channel_attrs, 7 words each */
+        /* Fore Channel */
+        WRITE32(args->fc_attrs.headerpadsz);    /* header padding size */
+        WRITE32(args->fc_attrs.max_rqst_sz);    /* max req size */
+        WRITE32(args->fc_attrs.max_resp_sz);    /* max resp size */
+        WRITE32(args->fc_attrs.max_resp_sz_cached);     /* Max resp sz cached */
+        WRITE32(args->fc_attrs.max_ops);        /* max operations */
+        WRITE32(args->fc_attrs.max_reqs);       /* max requests */
+        WRITE32(0);                             /* rdmachannel_attrs */
+        /* Back Channel */
+        WRITE32(args->bc_attrs.headerpadsz);    /* header padding size */
+        WRITE32(args->bc_attrs.max_rqst_sz);    /* max req size */
+        WRITE32(args->bc_attrs.max_resp_sz);    /* max resp size */
+        WRITE32(args->bc_attrs.max_resp_sz_cached);     /* Max resp sz cached */
+        WRITE32(args->bc_attrs.max_ops);        /* max operations */
+        WRITE32(args->bc_attrs.max_reqs);       /* max requests */
+        WRITE32(0);                             /* rdmachannel_attrs */
+        RESERVE_SPACE(4);
+        WRITE32(args->cb_program);              /* cb_program */
+        RESERVE_SPACE(4);                       /* # of security flavors */
+        WRITE32(1);
+        RESERVE_SPACE(4);
+        WRITE32(RPC_AUTH_UNIX);                 /* auth_sys */
+        /* authsys_parms rfc1831 */
+        RESERVE_SPACE(4);
+        WRITE32((u32)clp->cl_boot_time.tv_nsec);        /* stamp */
+        /* scnprintf bounds the copy and returns the length actually stored */
+        len = scnprintf(machine_name, sizeof(machine_name), "%s",
+                        clp->cl_ipaddr);
+        /* length word + name bytes + uid + gid + gid count */
+        RESERVE_SPACE(16 + len);
+        WRITE32(len);
+        WRITEMEM(machine_name, len);
+        WRITE32(0);                             /* UID */
+        WRITE32(0);                             /* GID */
+        WRITE32(0);                             /* No more gids */
+        /* account for this op in the compound header and reply-size estimate */
+        hdr->nops++;
+        hdr->replen += decode_create_session_maxsz;
+}
+/*
+ * Append a DESTROY_SESSION operation to the compound being built in @xdr:
+ * the opcode followed by NFS4_MAX_SESSIONID_LEN bytes of
+ * session->sess_id.data. Bumps hdr->nops and adds
+ * decode_destroy_session_maxsz to hdr->replen.
+ */
+static void encode_destroy_session(struct xdr_stream *xdr,
+                                   struct nfs4_session *session,
+                                   struct compound_hdr *hdr)
+{
+        __be32 *p;      /* not referenced directly; presumably the cursor used by RESERVE_SPACE/WRITE* */
+        /* opcode word plus the fixed-size session id */
+        RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN);
+        WRITE32(OP_DESTROY_SESSION);
+        WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+        hdr->nops++;
+        hdr->replen += decode_destroy_session_maxsz;
 }
+#endif /* CONFIG_NFS_V4_1 */
+/*
+ * Append a SEQUENCE operation to the compound being built in @xdr.
+ *
+ * Does nothing when CONFIG_NFS_V4_1 is not defined, or when
+ * args->sa_session is NULL. Otherwise emits the opcode, the session id,
+ * the sequence number of the slot selected by args->sa_slotid in the
+ * session's fore channel slot table, the slot id, the table's
+ * highest_used_slotid and the cache_this flag, then bumps hdr->nops and
+ * adds decode_sequence_maxsz to hdr->replen.
+ */
+static void encode_sequence(struct xdr_stream *xdr,
+                            const struct nfs4_sequence_args *args,
+                            struct compound_hdr *hdr)
+{
+#if defined(CONFIG_NFS_V4_1)
+        struct nfs4_session *session = args->sa_session;
+        struct nfs4_slot_table *tp;
+        struct nfs4_slot *slot;
+        __be32 *p;      /* not referenced directly; presumably the cursor used by RESERVE_SPACE/WRITE* */
+        /* no session attached: emit nothing and leave hdr untouched */
+        if (!session)
+                return;
+        tp = &session->fc_slot_table;
+        /* warn (but carry on) if the slot id equals NFS4_MAX_SLOT_TABLE */
+        WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
+        slot = tp->slots + args->sa_slotid;
+        RESERVE_SPACE(4);
+        WRITE32(OP_SEQUENCE);
+        /*
+         * Sessionid + seqid + slotid + max slotid + cache_this
+         */
+        dprintk("%s: sessionid=%u:%u:%u:%u seqid=%d slotid=%d "
+                "max_slotid=%d cache_this=%d\n",
+                __func__,
+                ((u32 *)session->sess_id.data)[0],
+                ((u32 *)session->sess_id.data)[1],
+                ((u32 *)session->sess_id.data)[2],
+                ((u32 *)session->sess_id.data)[3],
+                slot->seq_nr, args->sa_slotid,
+                tp->highest_used_slotid, args->sa_cache_this);
+        /* session id bytes followed by four 32-bit words */
+        RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16);
+        WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+        WRITE32(slot->seq_nr);
+        WRITE32(args->sa_slotid);
+        WRITE32(tp->highest_used_slotid);
+        WRITE32(args->sa_cache_this);
+        hdr->nops++;
+        hdr->replen += decode_sequence_maxsz;
+#endif /* CONFIG_NFS_V4_1 */
+}
 /*
 * END OF "GENERIC" ENCODE ROUTINES.
 */
+/*
+ * Pick the minor version to put in a compound header: the owning
+ * client's cl_minorversion when the request carries a session (only
+ * possible with CONFIG_NFS_V4_1), otherwise 0.
+ */
+static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
+{
+        u32 minor = 0;
+#if defined(CONFIG_NFS_V4_1)
+        const struct nfs4_session *sess = args->sa_session;
+        if (sess != NULL)
+                minor = sess->clp->cl_minorversion;
+#endif /* CONFIG_NFS_V4_1 */
+        return minor;
+}
 /*
 * Encode an ACCESS request
 */
@@ -1351,11 +1701,12 @@ static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_access(&xdr, args->access, &hdr);
        encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1370,11 +1721,12 @@ static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->dir_fh, &hdr);
        encode_lookup(&xdr, args->name, &hdr);
        encode_getfh(&xdr, &hdr);
@@ -1390,11 +1742,12 @@ static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struc
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putrootfh(&xdr, &hdr);
        encode_getfh(&xdr, &hdr);
        encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1409,11 +1762,12 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_remove(&xdr, &args->name, &hdr);
        encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1428,11 +1782,12 @@ static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->old_dir, &hdr);
        encode_savefh(&xdr, &hdr);
        encode_putfh(&xdr, args->new_dir, &hdr);
@@ -1451,11 +1806,12 @@ static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_savefh(&xdr, &hdr);
        encode_putfh(&xdr, args->dir_fh, &hdr);
@@ -1474,11 +1830,12 @@ static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->dir_fh, &hdr);
        encode_savefh(&xdr, &hdr);
        encode_create(&xdr, args, &hdr);
@@ -1505,11 +1862,12 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nf
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
@@ -1523,11 +1881,12 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closea
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops   = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_close(&xdr, args, &hdr);
        encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1542,11 +1901,12 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openarg
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_savefh(&xdr, &hdr);
        encode_open(&xdr, args, &hdr);
@@ -1569,7 +1929,7 @@ static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_open_confirm(&xdr, args, &hdr);
        encode_nops(&hdr);
@@ -1583,11 +1943,12 @@ static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops   = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_open(&xdr, args, &hdr);
        encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1602,11 +1963,12 @@ static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct n
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops   = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_open_downgrade(&xdr, args, &hdr);
        encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1621,11 +1983,12 @@ static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_ar
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops   = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_lock(&xdr, args, &hdr);
        encode_nops(&hdr);
@@ -1639,11 +2002,12 @@ static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops   = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_lockt(&xdr, args, &hdr);
        encode_nops(&hdr);
@@ -1657,11 +2021,12 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops   = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_locku(&xdr, args, &hdr);
        encode_nops(&hdr);
@@ -1675,22 +2040,16 @@ static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct n
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
-        unsigned int replen;
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_readlink(&xdr, args, req, &hdr);
-        /* set up reply kvec
+        xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
-         *    toplevel_status + taglen + rescount + OP_PUTFH + status
-         *      + OP_READLINK + status + string length = 8
-         */
-        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readlink_sz) << 2;
-        xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages,
                        args->pgbase, args->pglen);
        encode_nops(&hdr);
        return 0;
@@ -1703,25 +2062,19 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
-        int replen;
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_readdir(&xdr, args, req, &hdr);
-        /* set up reply kvec
+        xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
-         *    toplevel_status + taglen + rescount + OP_PUTFH + status
-         *      + OP_READDIR + status + verifer(2)  = 9
-         */
-        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readdir_sz) << 2;
-        xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages,
                         args->pgbase, args->count);
        dprintk("%s: inlined page args = (%u, %p, %u, %u)\n",
-                        __func__, replen, args->pages,
+                        __func__, hdr.replen << 2, args->pages,
                        args->pgbase, args->count);
        encode_nops(&hdr);
        return 0;
@@ -1732,24 +2085,18 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
 */
 static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
 {
-        struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        int replen;
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_read(&xdr, args, &hdr);
-        /* set up reply kvec
+        xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
-         *    toplevel status + taglen=0 + rescount + OP_PUTFH + status
-         *       + OP_READ + status + eof + datalen = 9
-         */
-        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_read_sz) << 2;
-        xdr_inline_pages(&req->rq_rcv_buf, replen,
                         args->pages, args->pgbase, args->count);
        req->rq_rcv_buf.flags |= XDRBUF_READ;
        encode_nops(&hdr);
@@ -1763,11 +2110,12 @@ static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_seta
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops   = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_setattr(&xdr, args, args->server, &hdr);
        encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1783,20 +2131,19 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
                struct nfs_getaclargs *args)
 {
        struct xdr_stream xdr;
-        struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
        struct compound_hdr hdr = {
-                .nops   = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        int replen;
+        uint32_t replen;
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
+        replen = hdr.replen + nfs4_fattr_bitmap_maxsz + 1;
        encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
-        /* set up reply buffer: */
+        xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
-        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2;
-        xdr_inline_pages(&req->rq_rcv_buf, replen,
                args->acl_pages, args->acl_pgbase, args->acl_len);
        encode_nops(&hdr);
        return 0;
@@ -1809,11 +2156,12 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writea
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_write(&xdr, args, &hdr);
        req->rq_snd_buf.flags |= XDRBUF_WRITE;
@@ -1829,11 +2177,12 @@ static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_write
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_commit(&xdr, args, &hdr);
        encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1848,11 +2197,12 @@ static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsin
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops   = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_fsinfo(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
@@ -1866,11 +2216,12 @@ static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct n
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
                           &hdr);
@@ -1885,11 +2236,12 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
                           args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
@@ -1900,16 +2252,18 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs
 /*
 * GETATTR_BITMAP request
 */
-static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struct nfs_fh *fhandle)
+static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p,
+                                    struct nfs4_server_caps_arg *args)
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
-        encode_putfh(&xdr, fhandle, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(&xdr, args->fhandle, &hdr);
        encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
                           FATTR4_WORD0_LINK_SUPPORT|
                           FATTR4_WORD0_SYMLINK_SUPPORT|
@@ -1929,7 +2283,7 @@ static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
        encode_renew(&xdr, clp, &hdr);
        encode_nops(&hdr);
        return 0;
@@ -1946,7 +2300,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
        encode_setclientid(&xdr, sc, &hdr);
        encode_nops(&hdr);
        return 0;
@@ -1964,7 +2318,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
        const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
        encode_setclientid_confirm(&xdr, clp, &hdr);
        encode_putrootfh(&xdr, &hdr);
        encode_fsinfo(&xdr, lease_bitmap, &hdr);
@@ -1979,11 +2333,12 @@ static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struc
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fhandle, &hdr);
        encode_delegreturn(&xdr, args->stateid, &hdr);
        encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1998,28 +2353,119 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+        uint32_t replen;
-        int replen;
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->dir_fh, &hdr);
        encode_lookup(&xdr, args->name, &hdr);
+        replen = hdr.replen;    /* get the attribute into args->page */
        encode_fs_locations(&xdr, args->bitmask, &hdr);
-        /* set up reply
+        xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,
-         *   toplevel_status + OP_PUTFH + status
-         *   + OP_LOOKUP + status + OP_GETATTR + status = 7
-         */
-        replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2;
-        xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page,
                        0, PAGE_SIZE);
        encode_nops(&hdr);
        return 0;
 }
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Encode an EXCHANGE_ID compound: the compound header followed by a
+ * single EXCHANGE_ID operation. The minor version comes from
+ * args->client. Always returns 0.
+ */
+static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
+                                    struct nfs41_exchange_id_args *args)
+{
+        struct compound_hdr hdr = {
+                .minorversion = args->client->cl_minorversion,
+        };
+        struct xdr_stream xdr;
+        xdr_init_encode(&xdr, &req->rq_snd_buf, p);     /* write into the send buffer */
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_exchange_id(&xdr, args, &hdr);           /* the only operation */
+        encode_nops(&hdr);                              /* finalise the op count */
+        return 0;
+}
+/*
+ * Encode a CREATE_SESSION compound: the compound header followed by a
+ * single CREATE_SESSION operation. The minor version comes from
+ * args->client. Always returns 0.
+ */
+static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p,
+                                       struct nfs41_create_session_args *args)
+{
+        struct compound_hdr hdr = {
+                .minorversion = args->client->cl_minorversion,
+        };
+        struct xdr_stream xdr;
+        xdr_init_encode(&xdr, &req->rq_snd_buf, p);     /* write into the send buffer */
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_create_session(&xdr, args, &hdr);        /* the only operation */
+        encode_nops(&hdr);                              /* finalise the op count */
+        return 0;
+}
+/*
+ * Encode a DESTROY_SESSION compound: the compound header followed by a
+ * single DESTROY_SESSION operation for @session. The minor version
+ * comes from session->clp. Always returns 0.
+ */
+static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p,
+                                        struct nfs4_session *session)
+{
+        struct compound_hdr hdr = {
+                .minorversion = session->clp->cl_minorversion,
+        };
+        struct xdr_stream xdr;
+        xdr_init_encode(&xdr, &req->rq_snd_buf, p);     /* write into the send buffer */
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_destroy_session(&xdr, session, &hdr);    /* the only operation */
+        encode_nops(&hdr);                              /* finalise the op count */
+        return 0;
+}
+/*
+ * Encode a compound that carries nothing but a SEQUENCE operation.
+ * The minor version is derived from @args via nfs4_xdr_minorversion().
+ * Always returns 0.
+ */
+static int nfs4_xdr_enc_sequence(struct rpc_rqst *req, uint32_t *p,
+                                 struct nfs4_sequence_args *args)
+{
+        struct compound_hdr hdr = {
+                .minorversion = nfs4_xdr_minorversion(args),
+        };
+        struct xdr_stream xdr;
+        xdr_init_encode(&xdr, &req->rq_snd_buf, p);     /* write into the send buffer */
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, args, &hdr);              /* the only operation */
+        encode_nops(&hdr);                              /* finalise the op count */
+        return 0;
+}
+/*
+ * Encode a GET_LEASE_TIME compound: SEQUENCE, PUTROOTFH, then an fsinfo
+ * request whose bitmap asks only for FATTR4_WORD0_LEASE_TIME.
+ * Always returns 0.
+ */
+static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
+                                       struct nfs4_get_lease_time_args *args)
+{
+        /* request just the lease time attribute */
+        const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
+        struct compound_hdr hdr = {
+                .minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
+        };
+        struct xdr_stream xdr;
+        xdr_init_encode(&xdr, &req->rq_snd_buf, p);     /* write into the send buffer */
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->la_seq_args, &hdr);
+        encode_putrootfh(&xdr, &hdr);
+        encode_fsinfo(&xdr, lease_bitmap, &hdr);
+        encode_nops(&hdr);                              /* finalise the op count */
+        return 0;
+}
+#endif /* CONFIG_NFS_V4_1 */
 /*
 * START OF "GENERIC" DECODE ROUTINES.
 *   These may look a little ugly since they are imported from a "generic"
@@ -3657,7 +4103,7 @@ decode_savefh(struct xdr_stream *xdr)
        return decode_op_hdr(xdr, OP_SAVEFH);
 }
-static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)
+static int decode_setattr(struct xdr_stream *xdr)
 {
        __be32 *p;
        uint32_t bmlen;
@@ -3735,6 +4181,169 @@ static int decode_delegreturn(struct xdr_stream *xdr)
        return decode_op_hdr(xdr, OP_DELEGRETURN);
 }
+#if defined(CONFIG_NFS_V4_1)
+static int decode_exchange_id(struct xdr_stream *xdr,
+                              struct nfs41_exchange_id_res *res)
+{       /* Decode the EXCHANGE_ID op result; kept fields are stored in res->client. */
+        __be32 *p;
+        uint32_t dummy;
+        int status;
+        struct nfs_client *clp = res->client;
+        status = decode_op_hdr(xdr, OP_EXCHANGE_ID);
+        if (status)
+                return status;
+        READ_BUF(8);    /* one 64-bit value */
+        READ64(clp->cl_ex_clid);
+        READ_BUF(12);   /* three 32-bit words: seqid, exchange flags, state-protect type */
+        READ32(clp->cl_seqid);
+        READ32(clp->cl_exchange_flags);
+        /* We ask for SP4_NONE, so any other value in the reply is an error */
+        READ32(dummy);
+        if (dummy != SP4_NONE)
+                return -EIO;
+        /* Throw away minor_id (8 bytes) */
+        READ_BUF(8);
+        /* Throw away Major id: a length word, then that many bytes */
+        READ_BUF(4);
+        READ32(dummy);
+        READ_BUF(dummy);
+        /* Throw away server_scope: a length word, then that many bytes */
+        READ_BUF(4);
+        READ32(dummy);
+        READ_BUF(dummy);
+        /* Throw away Implementation id array */
+        READ_BUF(4);
+        READ32(dummy);
+        READ_BUF(dummy);        /* NOTE(review): dummy is used as a byte count here; confirm that matches the array encoding */
+        return 0;
+}
+static int decode_chan_attrs(struct xdr_stream *xdr,
+                             struct nfs4_channel_attrs *attrs)
+{       /* Decode one set of channel attributes into *attrs. */
+        __be32 *p;
+        u32 nr_attrs;
+        READ_BUF(28);   /* the seven 32-bit words read below */
+        READ32(attrs->headerpadsz);
+        READ32(attrs->max_rqst_sz);
+        READ32(attrs->max_resp_sz);
+        READ32(attrs->max_resp_sz_cached);
+        READ32(attrs->max_ops);
+        READ32(attrs->max_reqs);
+        READ32(nr_attrs);       /* rdma attrs count; only 0 or 1 is accepted */
+        if (unlikely(nr_attrs > 1)) {
+                printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n",
+                        __func__, nr_attrs);
+                return -EINVAL;
+        }
+        if (nr_attrs == 1)
+                READ_BUF(4); /* skip rdma_attrs */
+        return 0;
+}
+static int decode_create_session(struct xdr_stream *xdr,
+                                 struct nfs41_create_session_res *res)
+{       /* Decode the CREATE_SESSION op result into res->client and its cl_session. */
+        __be32 *p;
+        int status;
+        struct nfs_client *clp = res->client;
+        struct nfs4_session *session = clp->cl_session; /* session fields are written through this pointer */
+        status = decode_op_hdr(xdr, OP_CREATE_SESSION);
+        if (status)
+                return status;
+        /* sessionid: NFS4_MAX_SESSIONID_LEN bytes copied into session->sess_id */
+        READ_BUF(NFS4_MAX_SESSIONID_LEN);
+        COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN);
+        /* seqid, flags: two 32-bit words */
+        READ_BUF(8);
+        READ32(clp->cl_seqid);
+        READ32(session->flags);
+        /* Channel attributes: fc_attrs first, bc_attrs only if that succeeded */
+        status = decode_chan_attrs(xdr, &session->fc_attrs);
+        if (!status)
+                status = decode_chan_attrs(xdr, &session->bc_attrs);
+        return status;
+}
+static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
+{       /* Only the op header is decoded; dummy is not referenced. */
+        return decode_op_hdr(xdr, OP_DESTROY_SESSION);
+}
+#endif /* CONFIG_NFS_V4_1 */
+static int decode_sequence(struct xdr_stream *xdr,
+                           struct nfs4_sequence_res *res,
+                           struct rpc_rqst *rqstp)
+{       /* Decode and validate a SEQUENCE reply; without CONFIG_NFS_V4_1 this just returns 0. rqstp is not referenced. */
+#if defined(CONFIG_NFS_V4_1)
+        struct nfs4_slot *slot;
+        struct nfs4_sessionid id;
+        u32 dummy;
+        int status;
+        __be32 *p;
+        if (!res->sr_session)
+                return 0;       /* no session attached: nothing is decoded and sr_status is left untouched */
+        status = decode_op_hdr(xdr, OP_SEQUENCE);
+        if (status)
+                goto out_err;
+        /*
+         * If the server returns different values for sessionID, slotID or
+         * sequence number, the server is looney tunes.
+         */
+        status = -ESERVERFAULT; /* stays in effect until every check below has passed */
+        slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];   /* slot selected by res->sr_slotid */
+        READ_BUF(NFS4_MAX_SESSIONID_LEN + 20);  /* session id plus five 32-bit words; NOTE(review): READ_BUF may return directly on short data, skipping out_err - confirm */
+        COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN);
+        if (memcmp(id.data, res->sr_session->sess_id.data,
+                   NFS4_MAX_SESSIONID_LEN)) {
+                dprintk("%s Invalid session id\n", __func__);
+                goto out_err;
+        }
+        /* seqid: must equal the slot's seq_nr */
+        READ32(dummy);
+        if (dummy != slot->seq_nr) {
+                dprintk("%s Invalid sequence number\n", __func__);
+                goto out_err;
+        }
+        /* slot id: must equal res->sr_slotid */
+        READ32(dummy);
+        if (dummy != res->sr_slotid) {
+                dprintk("%s Invalid slot id\n", __func__);
+                goto out_err;
+        }
+        /* highest slot id - currently not processed */
+        READ32(dummy);
+        /* target highest slot id - currently not processed */
+        READ32(dummy);
+        /* result flags - currently not processed */
+        READ32(dummy);
+        status = 0;
+out_err:
+        res->sr_status = status;        /* outcome is recorded in res as well as returned */
+        return status;
+#else  /* CONFIG_NFS_V4_1 */
+        return 0;
+#endif /* CONFIG_NFS_V4_1 */
+}
 /*
 * END OF "GENERIC" DECODE ROUTINES.
 */
@@ -3752,6 +4361,9 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct
        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
+                goto out;
        status = decode_putfh(&xdr);
        if (status)
                goto out;
@@ -3773,7 +4385,11 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
        int status;
        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-        if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (status)
+                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
                goto out;
        status = decode_putfh(&xdr);
        if (status != 0)
@@ -3796,7 +4412,11 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo
        int status;
        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-        if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (status)
+                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
                goto out;
        if ((status = decode_putfh(&xdr)) != 0)
                goto out;
@@ -3819,7 +4439,11 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf
        int status;
        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-        if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (status)
+                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
                goto out;
        if ((status = decode_putrootfh(&xdr)) != 0)
                goto out;
@@ -3839,7 +4463,11 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
        int status;
        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-        if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (status)
+                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
                goto out;
        if ((status = decode_putfh(&xdr)) != 0)
                goto out;
@@ -3860,7 +4488,11 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re
        int status;
        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-        if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (status)
+                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
                goto out;
        if ((status = decode_putfh(&xdr)) != 0)
                goto out;
@@ -3890,7 +4522,11 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link
        int status;
        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-        if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (status)
+                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
                goto out;
        if ((status = decode_putfh(&xdr)) != 0)
                goto out;
@@ -3923,7 +4559,11 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr
        int status;
        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-        if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (status)
+                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
                goto out;
        if ((status = decode_putfh(&xdr)) != 0)
                goto out;
@@ -3963,6 +4603,9 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
+                goto out;
        status = decode_putfh(&xdr);
        if (status)
                goto out;
@@ -3979,12 +4622,13 @@ nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .nops   = 0,
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        int status;
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        encode_compound_hdr(&xdr, &hdr);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
        encode_putfh(&xdr, args->fh, &hdr);
        status = encode_setacl(&xdr, args, &hdr);
        encode_nops(&hdr);
@@ -3995,7 +4639,8 @@ nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args
 * Decode SETACL response
 */
 static int
-nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, void *res)
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p,
+                    struct nfs_setaclres *res)
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr;
@@ -4005,10 +4650,13 @@ nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, void *res)
        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
+                goto out;
        status = decode_putfh(&xdr);
        if (status)
                goto out;
-        status = decode_setattr(&xdr, res);
+        status = decode_setattr(&xdr);
 out:
        return status;
 }
@@ -4017,7 +4665,8 @@ out:
 * Decode GETACL response
 */
 static int
-nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, size_t *acl_len)
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p,
+                    struct nfs_getaclres *res)
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr;
@@ -4027,10 +4676,13 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, size_t *acl_len)
        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
+                goto out;
        status = decode_putfh(&xdr);
        if (status)
                goto out;
-        status = decode_getacl(&xdr, rqstp, acl_len);
+        status = decode_getacl(&xdr, rqstp, &res->acl_len);
 out:
        return status;
@@ -4049,6 +4701,9 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
+                goto out;
        status = decode_putfh(&xdr);
        if (status)
                goto out;
@@ -4079,6 +4734,9 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr
        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
+                goto out;
        status = decode_putfh(&xdr);
        if (status)
                goto out;
@@ -4133,6 +4791,9 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nf
        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
+                goto out;
        status = decode_putfh(&xdr);
        if (status)
                goto out;
@@ -4157,10 +4818,13 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
+                goto out;
        status = decode_putfh(&xdr);
        if (status)
                goto out;
-        status = decode_setattr(&xdr, res);
+        status = decode_setattr(&xdr);
        if (status)
                goto out;
        decode_getfattr(&xdr, res->fattr, res->server);
@@ -4181,6 +4845,9 @@ static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_
        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
+                goto out;
        status = decode_putfh(&xdr);
        if (status)
                goto out;
@@ -4202,6 +4869,9 @@ static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock
        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
+                goto out;
        status = decode_putfh(&xdr);
        if (status)
                goto out;
@@ -4223,6 +4893,9 @@ static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock
        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
+                goto out;
        status = decode_putfh(&xdr);
        if (status)
                goto out;
@@ -4234,7 +4907,8 @@ out:
 /*
 * Decode READLINK response
 */
-static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, void *res)
+static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p,
+                                 struct nfs4_readlink_res *res)
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr;
@@ -4244,6 +4918,9 @@ static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, void *res)
        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
+                goto out;
        status = decode_putfh(&xdr);
        if (status)
                goto out;
@@ -4265,6 +4942,9 @@ static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_r
        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
+                goto out;
        status = decode_putfh(&xdr);
        if (status)
                goto out;
@@ -4286,6 +4966,9 @@ static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readr
        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
+                goto out;
        status = decode_putfh(&xdr);
        if (status)
                goto out;
@@ -4309,6 +4992,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writ
        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
+                goto out;
        status = decode_putfh(&xdr);
        if (status)
                goto out;
@@ -4335,6 +5021,9 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_wri
        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
+                goto out;
        status = decode_putfh(&xdr);
        if (status)
                goto out;
@@ -4349,7 +5038,8 @@ out:
 /*
 * FSINFO request
 */
-static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
+static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p,
+                               struct nfs4_fsinfo_res *res)
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr;
@@ -4358,16 +5048,19 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinf
        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
+                status = decode_sequence(&xdr, &res->seq_res, req);
+        if (!status)
                status = decode_putfh(&xdr);
        if (!status)
-                status = decode_fsinfo(&xdr, fsinfo);
+                status = decode_fsinfo(&xdr, res->fsinfo);
        return status;
 }
 /*
 * PATHCONF request
 */
-static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *pathconf)
+static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p,
+                                 struct nfs4_pathconf_res *res)
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr;
@@ -4376,16 +5069,19 @@ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs_pat
        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
+                status = decode_sequence(&xdr, &res->seq_res, req);
+        if (!status)
                status = decode_putfh(&xdr);
        if (!status)
-                status = decode_pathconf(&xdr, pathconf);
+                status = decode_pathconf(&xdr, res->pathconf);
        return status;
 }
 /*
 * STATFS request
 */
-static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *fsstat)
+static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p,
+                               struct nfs4_statfs_res *res)
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr;
@@ -4394,9 +5090,11 @@ static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, struct nfs_fssta
        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
+                status = decode_sequence(&xdr, &res->seq_res, req);
+        if (!status)
                status = decode_putfh(&xdr);
        if (!status)
-                status = decode_statfs(&xdr, fsstat);
+                status = decode_statfs(&xdr, res->fsstat);
        return status;
 }
@@ -4410,7 +5108,11 @@ static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4
        int status;
        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-        if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (status)
+                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, req);
+        if (status)
                goto out;
        if ((status = decode_putfh(&xdr)) != 0)
                goto out;
@@ -4483,7 +5185,10 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
        status = decode_compound_hdr(&xdr, &hdr);
-        if (status != 0)
+        if (status)
+                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status)
                goto out;
        status = decode_putfh(&xdr);
        if (status != 0)
@@ -4497,7 +5202,8 @@ out:
 /*
 * FS_LOCATIONS request
 */
-static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations *res)
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
+                                     struct nfs4_fs_locations_res *res)
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr;
@@ -4505,18 +5211,113 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
        status = decode_compound_hdr(&xdr, &hdr);
-        if (status != 0)
+        if (status)
+                goto out;
+        status = decode_sequence(&xdr, &res->seq_res, req);
+        if (status)
                goto out;
        if ((status = decode_putfh(&xdr)) != 0)
                goto out;
        if ((status = decode_lookup(&xdr)) != 0)
                goto out;
        xdr_enter_page(&xdr, PAGE_SIZE);
-        status = decode_getfattr(&xdr, &res->fattr, res->server);
+        status = decode_getfattr(&xdr, &res->fs_locations->fattr,
+                                 res->fs_locations->server);
 out:
        return status;
 }
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Decode an EXCHANGE_ID response: compound header, then the EXCHANGE_ID op.
+ */
+static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p,
+                                    void *res)
+{
+        struct xdr_stream xdr;
+        struct compound_hdr hdr;
+        int status;
+        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);   /* decode from the request's receive buffer */
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (!status)
+                status = decode_exchange_id(&xdr, res); /* res is passed on as the op's result struct */
+        return status;
+}
+/*
+ * Decode a CREATE_SESSION response: compound header, then the op itself.
+ */
+static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p,
+                                       struct nfs41_create_session_res *res)
+{
+        struct xdr_stream xdr;
+        struct compound_hdr hdr;
+        int status;
+        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);   /* decode from the request's receive buffer */
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (!status)    /* the op is decoded only if the header decoded cleanly */
+                status = decode_create_session(&xdr, res);
+        return status;
+}
+/*
+ * Decode a DESTROY_SESSION response; dummy is only passed through.
+ */
+static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p,
+                                        void *dummy)
+{
+        struct xdr_stream xdr;
+        struct compound_hdr hdr;
+        int status;
+        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);   /* decode from the request's receive buffer */
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (!status)    /* the op is decoded only if the header decoded cleanly */
+                status = decode_destroy_session(&xdr, dummy);
+        return status;
+}
+/*
+ * Decode a SEQUENCE response: compound header, then the SEQUENCE op.
+ */
+static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p,
+                                 struct nfs4_sequence_res *res)
+{
+        struct xdr_stream xdr;
+        struct compound_hdr hdr;
+        int status;
+        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);   /* decode from the request's receive buffer */
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (!status)    /* the op is decoded only if the header decoded cleanly */
+                status = decode_sequence(&xdr, res, rqstp);
+        return status;
+}
+/*
+ * Decode a GET_LEASE_TIME response: SEQUENCE, PUTROOTFH, then fsinfo.
+ */
+static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p,
+                                       struct nfs4_get_lease_time_res *res)
+{
+        struct xdr_stream xdr;
+        struct compound_hdr hdr;
+        int status;
+        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);   /* decode from the request's receive buffer */
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (!status)    /* each step below runs only if all earlier ones returned 0 */
+                status = decode_sequence(&xdr, &res->lr_seq_res, rqstp);
+        if (!status)
+                status = decode_putrootfh(&xdr);
+        if (!status)
+                status = decode_fsinfo(&xdr, res->lr_fsinfo);   /* result goes to the caller-supplied lr_fsinfo */
+        return status;
+}
+#endif /* CONFIG_NFS_V4_1 */
 __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
 {
        uint32_t bitmap[2] = {0};
@@ -4686,6 +5487,13 @@ struct rpc_procinfo	nfs4_procedures[] = {
  PROC(GETACL,          enc_getacl,     dec_getacl),
  PROC(SETACL,          enc_setacl,     dec_setacl),
  PROC(FS_LOCATIONS,    enc_fs_locations, dec_fs_locations),
+#if defined(CONFIG_NFS_V4_1)
+  PROC(EXCHANGE_ID,     enc_exchange_id,        dec_exchange_id),
+  PROC(CREATE_SESSION,  enc_create_session,     dec_create_session),
+  PROC(DESTROY_SESSION, enc_destroy_session,    dec_destroy_session),
+  PROC(SEQUENCE,        enc_sequence,   dec_sequence),
+  PROC(GET_LEASE_TIME,  enc_get_lease_time,     dec_get_lease_time),
+#endif /* CONFIG_NFS_V4_1 */
 };
 struct rpc_version              nfs_version4 = {
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index e3ed5908820b..8c55b27c0de4 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -92,6 +92,9 @@
 #undef NFSROOT_DEBUG
 #define NFSDBG_FACILITY NFSDBG_ROOT
+/* Default port to use if server is not running a portmapper */
+#define NFS_MNT_PORT    627
 /* Default path we try to mount. "%s" gets replaced by our IP address */
 #define NFS_ROOT                "/tftpboot/%s"
@@ -487,6 +490,7 @@ static int __init root_nfs_get_handle(void)
 {
        struct nfs_fh fh;
        struct sockaddr_in sin;
+        unsigned int auth_flav_len = 0;
        struct nfs_mount_request request = {
                .sap            = (struct sockaddr *)&sin,
                .salen          = sizeof(sin),
@@ -496,6 +500,7 @@ static int __init root_nfs_get_handle(void)
                .protocol       = (nfs_data.flags & NFS_MOUNT_TCP) ?
                                        XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
                .fh             = &fh,
+                .auth_flav_len  = &auth_flav_len,
        };
        int status;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 4ace3c50a8eb..12c9e66d3f1d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -18,10 +18,10 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
-#include <linux/smp_lock.h>
 #include <asm/system.h>
+#include "nfs4_fs.h"
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
@@ -46,6 +46,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->pages);
                p->npages = pagecount;
+                p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
                if (pagecount <= ARRAY_SIZE(p->page_array))
                        p->pagevec = p->page_array;
                else {
@@ -59,17 +60,15 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
        return p;
 }
-static void nfs_readdata_free(struct nfs_read_data *p)
+void nfs_readdata_free(struct nfs_read_data *p)
 {
        if (p && (p->pagevec != &p->page_array[0]))
                kfree(p->pagevec);
        mempool_free(p, nfs_rdata_mempool);
 }
-void nfs_readdata_release(void *data)
+static void nfs_readdata_release(struct nfs_read_data *rdata)
 {
-        struct nfs_read_data *rdata = data;
        put_nfs_open_context(rdata->args.context);
        nfs_readdata_free(rdata);
 }
@@ -357,19 +356,25 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
        struct nfs_readres *resp = &data->res;
        if (resp->eof || resp->count == argp->count)
-                return;
+                goto out;
        /* This is a short read! */
        nfs_inc_stats(data->inode, NFSIOS_SHORTREAD);
        /* Has the server at least made some progress? */
        if (resp->count == 0)
-                return;
+                goto out;
        /* Yes, so retry the read at the end of the data */
        argp->offset += resp->count;
        argp->pgbase += resp->count;
        argp->count -= resp->count;
-        rpc_restart_call(task);
+        nfs4_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client);
+        return;
+out:
+        nfs4_sequence_free_slot(NFS_SERVER(data->inode)->nfs_client,
+                                &data->res.seq_res);
+        return;
 }
 /*
@@ -406,7 +411,23 @@ static void nfs_readpage_release_partial(void *calldata)
        nfs_readdata_release(calldata);
 }
+#if defined(CONFIG_NFS_V4_1)
+void nfs_read_prepare(struct rpc_task *task, void *calldata)
+{       /* Prepare hook used by the read rpc_call_ops in this file: set up the sequence, then start the call. */
+        struct nfs_read_data *data = calldata;  /* calldata is the read's nfs_read_data */
+        if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client,
+                                &data->args.seq_args, &data->res.seq_res,
+                                0, task))
+                return; /* non-zero from nfs4_setup_sequence: rpc_call_start is not invoked here */
+        rpc_call_start(task);
+}
+#endif /* CONFIG_NFS_V4_1 */
 static const struct rpc_call_ops nfs_read_partial_ops = {
+#if defined(CONFIG_NFS_V4_1)
+        .rpc_call_prepare = nfs_read_prepare,
+#endif /* CONFIG_NFS_V4_1 */
        .rpc_call_done = nfs_readpage_result_partial,
        .rpc_release = nfs_readpage_release_partial,
 };
@@ -470,6 +491,9 @@ static void nfs_readpage_release_full(void *calldata)
 }
 static const struct rpc_call_ops nfs_read_full_ops = {
+#if defined(CONFIG_NFS_V4_1)
+        .rpc_call_prepare = nfs_read_prepare,
+#endif /* CONFIG_NFS_V4_1 */
        .rpc_call_done = nfs_readpage_result_full,
        .rpc_release = nfs_readpage_release_full,
 };
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 26127b69a275..0b4cbdc60abd 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -42,6 +42,8 @@
 #include <linux/smp_lock.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
+#include <linux/mnt_namespace.h>
+#include <linux/namei.h>
 #include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
 #include <linux/inet.h>
@@ -90,6 +92,7 @@ enum {
        Opt_mountport,
        Opt_mountvers,
        Opt_nfsvers,
+        Opt_minorversion,
        /* Mount options that take string arguments */
        Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
@@ -139,22 +142,23 @@ static const match_table_t nfs_mount_option_tokens = {
        { Opt_fscache_uniq, "fsc=%s" },
        { Opt_nofscache, "nofsc" },
-        { Opt_port, "port=%u" },
+        { Opt_port, "port=%s" },
-        { Opt_rsize, "rsize=%u" },
+        { Opt_rsize, "rsize=%s" },
-        { Opt_wsize, "wsize=%u" },
+        { Opt_wsize, "wsize=%s" },
-        { Opt_bsize, "bsize=%u" },
+        { Opt_bsize, "bsize=%s" },
-        { Opt_timeo, "timeo=%u" },
+        { Opt_timeo, "timeo=%s" },
-        { Opt_retrans, "retrans=%u" },
+        { Opt_retrans, "retrans=%s" },
-        { Opt_acregmin, "acregmin=%u" },
+        { Opt_acregmin, "acregmin=%s" },
-        { Opt_acregmax, "acregmax=%u" },
+        { Opt_acregmax, "acregmax=%s" },
-        { Opt_acdirmin, "acdirmin=%u" },
+        { Opt_acdirmin, "acdirmin=%s" },
-        { Opt_acdirmax, "acdirmax=%u" },
+        { Opt_acdirmax, "acdirmax=%s" },
-        { Opt_actimeo, "actimeo=%u" },
+        { Opt_actimeo, "actimeo=%s" },
-        { Opt_namelen, "namlen=%u" },
+        { Opt_namelen, "namlen=%s" },
-        { Opt_mountport, "mountport=%u" },
+        { Opt_mountport, "mountport=%s" },
-        { Opt_mountvers, "mountvers=%u" },
+        { Opt_mountvers, "mountvers=%s" },
-        { Opt_nfsvers, "nfsvers=%u" },
+        { Opt_nfsvers, "nfsvers=%s" },
-        { Opt_nfsvers, "vers=%u" },
+        { Opt_nfsvers, "vers=%s" },
+        { Opt_minorversion, "minorversion=%u" },
        { Opt_sec, "sec=%s" },
        { Opt_proto, "proto=%s" },
@@ -270,10 +274,14 @@ static const struct super_operations nfs_sops = {
 #ifdef CONFIG_NFS_V4
 static int nfs4_get_sb(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static int nfs4_remote_get_sb(struct file_system_type *fs_type,
+        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static int nfs4_xdev_get_sb(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static int nfs4_referral_get_sb(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
+        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static void nfs4_kill_super(struct super_block *sb);
 static struct file_system_type nfs4_fs_type = {
@@ -284,6 +292,14 @@ static struct file_system_type nfs4_fs_type = {
        .fs_flags       = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
+/*
+ * File-local "nfs4" filesystem type whose superblocks are obtained through
+ * nfs4_remote_get_sb() and torn down through nfs4_kill_super().
+ */
+static struct file_system_type nfs4_remote_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = "nfs4",
+        .get_sb         = nfs4_remote_get_sb,
+        .kill_sb        = nfs4_kill_super,
+        .fs_flags       = FS_RENAME_DOES_D_MOVE |
+                          FS_REVAL_DOT |
+                          FS_BINARY_MOUNTDATA,
+};
 struct file_system_type nfs4_xdev_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "nfs4",
@@ -292,6 +308,14 @@ struct file_system_type nfs4_xdev_fs_type = {
        .fs_flags       = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
+/*
+ * File-local "nfs4" filesystem type whose superblocks are obtained through
+ * nfs4_remote_referral_get_sb() and torn down through nfs4_kill_super().
+ */
+static struct file_system_type nfs4_remote_referral_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = "nfs4",
+        .get_sb         = nfs4_remote_referral_get_sb,
+        .kill_sb        = nfs4_kill_super,
+        .fs_flags       = FS_RENAME_DOES_D_MOVE |
+                          FS_REVAL_DOT |
+                          FS_BINARY_MOUNTDATA,
+};
 struct file_system_type nfs4_referral_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "nfs4",
@@ -514,7 +538,6 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
                const char *nostr;
        } nfs_info[] = {
                { NFS_MOUNT_SOFT, ",soft", ",hard" },
-                { NFS_MOUNT_INTR, ",intr", ",nointr" },
                { NFS_MOUNT_POSIX, ",posix", "" },
                { NFS_MOUNT_NOCTO, ",nocto", "" },
                { NFS_MOUNT_NOAC, ",noac", "" },
@@ -943,11 +966,6 @@ static int nfs_parse_security_flavors(char *value,
        return 1;
 }
-static void nfs_parse_invalid_value(const char *option)
-{
-        dfprintk(MOUNT, "NFS:   bad value specified for %s option\n", option);
-}
 /*
 * Error-check and convert a string of mount options from user space into
 * a data structure.  The whole mount string is processed; bad options are
@@ -958,7 +976,7 @@ static int nfs_parse_mount_options(char *raw,
                                   struct nfs_parsed_mount_data *mnt)
 {
        char *p, *string, *secdata;
-        int rc, sloppy = 0, errors = 0;
+        int rc, sloppy = 0, invalid_option = 0;
        if (!raw) {
                dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -982,7 +1000,9 @@ static int nfs_parse_mount_options(char *raw,
        while ((p = strsep(&raw, ",")) != NULL) {
                substring_t args[MAX_OPT_ARGS];
-                int option, token;
+                unsigned long option;
+                int int_option;
+                int token;
                if (!*p)
                        continue;
@@ -1091,114 +1111,156 @@ static int nfs_parse_mount_options(char *raw,
                 * options that take numeric values
                 */
                case Opt_port:
-                        if (match_int(args, &option) ||
+                        string = match_strdup(args);
-                            option < 0 || option > USHORT_MAX) {
+                        if (string == NULL)
-                                errors++;
+                                goto out_nomem;
-                                nfs_parse_invalid_value("port");
+                        rc = strict_strtoul(string, 10, &option);
-                        } else
+                        kfree(string);
-                                mnt->nfs_server.port = option;
+                        if (rc != 0 || option > USHORT_MAX)
+                                goto out_invalid_value;
+                        mnt->nfs_server.port = option;
                        break;
                case Opt_rsize:
-                        if (match_int(args, &option) || option < 0) {
+                        string = match_strdup(args);
-                                errors++;
+                        if (string == NULL)
-                                nfs_parse_invalid_value("rsize");
+                                goto out_nomem;
-                        } else
+                        rc = strict_strtoul(string, 10, &option);
-                                mnt->rsize = option;
+                        kfree(string);
+                        if (rc != 0)
+                                goto out_invalid_value;
+                        mnt->rsize = option;
                        break;
                case Opt_wsize:
-                        if (match_int(args, &option) || option < 0) {
+                        string = match_strdup(args);
-                                errors++;
+                        if (string == NULL)
-                                nfs_parse_invalid_value("wsize");
+                                goto out_nomem;
-                        } else
+                        rc = strict_strtoul(string, 10, &option);
-                                mnt->wsize = option;
+                        kfree(string);
+                        if (rc != 0)
+                                goto out_invalid_value;
+                        mnt->wsize = option;
                        break;
                case Opt_bsize:
-                        if (match_int(args, &option) || option < 0) {
+                        string = match_strdup(args);
-                                errors++;
+                        if (string == NULL)
-                                nfs_parse_invalid_value("bsize");
+                                goto out_nomem;
-                        } else
+                        rc = strict_strtoul(string, 10, &option);
-                                mnt->bsize = option;
+                        kfree(string);
+                        if (rc != 0)
+                                goto out_invalid_value;
+                        mnt->bsize = option;
                        break;
                case Opt_timeo:
-                        if (match_int(args, &option) || option <= 0) {
+                        string = match_strdup(args);
-                                errors++;
+                        if (string == NULL)
-                                nfs_parse_invalid_value("timeo");
+                                goto out_nomem;
-                        } else
+                        rc = strict_strtoul(string, 10, &option);
-                                mnt->timeo = option;
+                        kfree(string);
+                        if (rc != 0 || option == 0)
+                                goto out_invalid_value;
+                        mnt->timeo = option;
                        break;
                case Opt_retrans:
-                        if (match_int(args, &option) || option <= 0) {
+                        string = match_strdup(args);
-                                errors++;
+                        if (string == NULL)
-                                nfs_parse_invalid_value("retrans");
+                                goto out_nomem;
-                        } else
+                        rc = strict_strtoul(string, 10, &option);
-                                mnt->retrans = option;
+                        kfree(string);
+                        if (rc != 0 || option == 0)
+                                goto out_invalid_value;
+                        mnt->retrans = option;
                        break;
                case Opt_acregmin:
-                        if (match_int(args, &option) || option < 0) {
+                        string = match_strdup(args);
-                                errors++;
+                        if (string == NULL)
-                                nfs_parse_invalid_value("acregmin");
+                                goto out_nomem;
-                        } else
+                        rc = strict_strtoul(string, 10, &option);
-                                mnt->acregmin = option;
+                        kfree(string);
+                        if (rc != 0)
+                                goto out_invalid_value;
+                        mnt->acregmin = option;
                        break;
                case Opt_acregmax:
-                        if (match_int(args, &option) || option < 0) {
+                        string = match_strdup(args);
-                                errors++;
+                        if (string == NULL)
-                                nfs_parse_invalid_value("acregmax");
+                                goto out_nomem;
-                        } else
+                        rc = strict_strtoul(string, 10, &option);
-                                mnt->acregmax = option;
+                        kfree(string);
+                        if (rc != 0)
+                                goto out_invalid_value;
+                        mnt->acregmax = option;
                        break;
                case Opt_acdirmin:
-                        if (match_int(args, &option) || option < 0) {
+                        string = match_strdup(args);
-                                errors++;
+                        if (string == NULL)
-                                nfs_parse_invalid_value("acdirmin");
+                                goto out_nomem;
-                        } else
+                        rc = strict_strtoul(string, 10, &option);
-                                mnt->acdirmin = option;
+                        kfree(string);
+                        if (rc != 0)
+                                goto out_invalid_value;
+                        mnt->acdirmin = option;
                        break;
                case Opt_acdirmax:
-                        if (match_int(args, &option) || option < 0) {
+                        string = match_strdup(args);
-                                errors++;
+                        if (string == NULL)
-                                nfs_parse_invalid_value("acdirmax");
+                                goto out_nomem;
-                        } else
+                        rc = strict_strtoul(string, 10, &option);
-                                mnt->acdirmax = option;
+                        kfree(string);
+                        if (rc != 0)
+                                goto out_invalid_value;
+                        mnt->acdirmax = option;
                        break;
                case Opt_actimeo:
-                        if (match_int(args, &option) || option < 0) {
+                        string = match_strdup(args);
-                                errors++;
+                        if (string == NULL)
-                                nfs_parse_invalid_value("actimeo");
+                                goto out_nomem;
-                        } else
+                        rc = strict_strtoul(string, 10, &option);
-                                mnt->acregmin = mnt->acregmax =
+                        kfree(string);
-                                mnt->acdirmin = mnt->acdirmax = option;
+                        if (rc != 0)
+                                goto out_invalid_value;
+                        mnt->acregmin = mnt->acregmax =
+                        mnt->acdirmin = mnt->acdirmax = option;
                        break;
                case Opt_namelen:
-                        if (match_int(args, &option) || option < 0) {
+                        string = match_strdup(args);
-                                errors++;
+                        if (string == NULL)
-                                nfs_parse_invalid_value("namlen");
+                                goto out_nomem;
-                        } else
+                        rc = strict_strtoul(string, 10, &option);
-                                mnt->namlen = option;
+                        kfree(string);
+                        if (rc != 0)
+                                goto out_invalid_value;
+                        mnt->namlen = option;
                        break;
                case Opt_mountport:
-                        if (match_int(args, &option) ||
+                        string = match_strdup(args);
-                            option < 0 || option > USHORT_MAX) {
+                        if (string == NULL)
-                                errors++;
+                                goto out_nomem;
-                                nfs_parse_invalid_value("mountport");
+                        rc = strict_strtoul(string, 10, &option);
-                        } else
+                        kfree(string);
-                                mnt->mount_server.port = option;
+                        if (rc != 0 || option > USHORT_MAX)
+                                goto out_invalid_value;
+                        mnt->mount_server.port = option;
                        break;
                case Opt_mountvers:
-                        if (match_int(args, &option) ||
+                        string = match_strdup(args);
+                        if (string == NULL)
+                                goto out_nomem;
+                        rc = strict_strtoul(string, 10, &option);
+                        kfree(string);
+                        if (rc != 0 ||
                            option < NFS_MNT_VERSION ||
-                            option > NFS_MNT3_VERSION) {
+                            option > NFS_MNT3_VERSION)
-                                errors++;
+                                goto out_invalid_value;
-                                nfs_parse_invalid_value("mountvers");
+                        mnt->mount_server.version = option;
-                        } else
-                                mnt->mount_server.version = option;
                        break;
                case Opt_nfsvers:
-                        if (match_int(args, &option)) {
+                        string = match_strdup(args);
-                                errors++;
+                        if (string == NULL)
-                                nfs_parse_invalid_value("nfsvers");
+                                goto out_nomem;
-                                break;
+                        rc = strict_strtoul(string, 10, &option);
-                        }
+                        kfree(string);
+                        if (rc != 0)
+                                goto out_invalid_value;
                        switch (option) {
                        case NFS2_VERSION:
                                mnt->flags &= ~NFS_MOUNT_VER3;
@@ -1207,10 +1269,16 @@ static int nfs_parse_mount_options(char *raw,
                                mnt->flags |= NFS_MOUNT_VER3;
                                break;
                        default:
-                                errors++;
+                                goto out_invalid_value;
-                                nfs_parse_invalid_value("nfsvers");
                        }
                        break;
+                case Opt_minorversion:
+                        if (match_int(args, &int_option))
+                                return 0;
+                        if (int_option < 0 || int_option > NFS4_MAX_MINOR_VERSION)
+                                return 0;
+                        mnt->minorversion = int_option;
+                        break;
                /*
                 * options that take text values
@@ -1222,9 +1290,9 @@ static int nfs_parse_mount_options(char *raw,
                        rc = nfs_parse_security_flavors(string, mnt);
                        kfree(string);
                        if (!rc) {
-                                errors++;
                                dfprintk(MOUNT, "NFS:   unrecognized "
                                                "security flavor\n");
+                                return 0;
                        }
                        break;
                case Opt_proto:
@@ -1238,23 +1306,25 @@ static int nfs_parse_mount_options(char *raw,
                        case Opt_xprt_udp:
                                mnt->flags &= ~NFS_MOUNT_TCP;
                                mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
+                                kfree(string);
                                break;
                        case Opt_xprt_tcp:
                                mnt->flags |= NFS_MOUNT_TCP;
                                mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+                                kfree(string);
                                break;
                        case Opt_xprt_rdma:
                                /* vector side protocols to TCP */
                                mnt->flags |= NFS_MOUNT_TCP;
                                mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
                                xprt_load_transport(string);
+                                kfree(string);
                                break;
                        default:
-                                errors++;
                                dfprintk(MOUNT, "NFS:   unrecognized "
                                                "transport protocol\n");
+                                return 0;
                        }
-                        kfree(string);
                        break;
                case Opt_mountproto:
                        string = match_strdup(args);
@@ -1273,9 +1343,9 @@ static int nfs_parse_mount_options(char *raw,
                                break;
                        case Opt_xprt_rdma: /* not used for side protocols */
                        default:
-                                errors++;
                                dfprintk(MOUNT, "NFS:   unrecognized "
                                                "transport protocol\n");
+                                return 0;
                        }
                        break;
                case Opt_addr:
@@ -1331,9 +1401,9 @@ static int nfs_parse_mount_options(char *raw,
                                        mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE;
                                        break;
                                default:
-                                        errors++;
                                        dfprintk(MOUNT, "NFS:   invalid "
                                                        "lookupcache argument\n");
+                                        return 0;
                        };
                        break;
@@ -1351,20 +1421,20 @@ static int nfs_parse_mount_options(char *raw,
                        break;
                default:
-                        errors++;
+                        invalid_option = 1;
                        dfprintk(MOUNT, "NFS:   unrecognized mount option "
                                        "'%s'\n", p);
                }
        }
-        if (errors > 0) {
+        if (!sloppy && invalid_option)
-                dfprintk(MOUNT, "NFS: parsing encountered %d error%s\n",
+                return 0;
-                                errors, (errors == 1 ? "" : "s"));
-                if (!sloppy)
-                        return 0;
-        }
        return 1;
+out_invalid_value:
+        printk(KERN_INFO "NFS: bad mount option value specified: %s \n", p);
+        return 0;
 out_nomem:
        printk(KERN_INFO "NFS: not enough memory to parse option\n");
        return 0;
@@ -1381,6 +1451,7 @@ out_security_failure:
 static int nfs_try_mount(struct nfs_parsed_mount_data *args,
                         struct nfs_fh *root_fh)
 {
+        unsigned int auth_flavor_len = 0;
        struct nfs_mount_request request = {
                .sap            = (struct sockaddr *)
                                                &args->mount_server.address,
@@ -1388,6 +1459,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
                .protocol       = args->mount_server.protocol,
                .fh             = root_fh,
                .noresvport     = args->flags & NFS_MOUNT_NORESVPORT,
+                .auth_flav_len  = &auth_flavor_len,
        };
        int status;
@@ -2240,6 +2312,11 @@ static void nfs4_fill_super(struct super_block *sb)
        nfs_initialise_sb(sb);
 }
+/*
+ * Strip NFS_MOUNT_NONLM, NFS_MOUNT_NOACL and NFS_MOUNT_VER3 from the
+ * parsed mount flags; every other flag bit is left as it was.
+ */
+static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
+{
+        args->flags &= ~NFS_MOUNT_NONLM;
+        args->flags &= ~NFS_MOUNT_NOACL;
+        args->flags &= ~NFS_MOUNT_VER3;
+}
 /*
 * Validate NFSv4 mount options
 */
@@ -2263,6 +2340,7 @@ static int nfs4_validate_mount_data(void *options,
        args->nfs_server.port   = NFS_PORT; /* 2049 unless user set port= */
        args->auth_flavors[0]   = RPC_AUTH_UNIX;
        args->auth_flavor_len   = 0;
+        args->minorversion      = 0;
        switch (data->version) {
        case 1:
@@ -2336,6 +2414,8 @@ static int nfs4_validate_mount_data(void *options,
                nfs_validate_transport_protocol(args);
+                nfs4_validate_mount_flags(args);
                if (args->auth_flavor_len > 1)
                        goto out_inval_auth;
@@ -2375,12 +2455,12 @@ out_no_client_address:
 }
 /*
- * Get the superblock for an NFS4 mountpoint
+ * Get the superblock for the NFS4 root partition
 */
-static int nfs4_get_sb(struct file_system_type *fs_type,
+static int nfs4_remote_get_sb(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
 {
-        struct nfs_parsed_mount_data *data;
+        struct nfs_parsed_mount_data *data = raw_data;
        struct super_block *s;
        struct nfs_server *server;
        struct nfs_fh *mntfh;
@@ -2391,18 +2471,12 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
        };
        int error = -ENOMEM;
-        data = kzalloc(sizeof(*data), GFP_KERNEL);
        mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
        if (data == NULL || mntfh == NULL)
                goto out_free_fh;
        security_init_mnt_opts(&data->lsm_opts);
-        /* Validate the mount data */
-        error = nfs4_validate_mount_data(raw_data, data, dev_name);
-        if (error < 0)
-                goto out;
        /* Get a volume representation */
        server = nfs4_create_server(data, mntfh);
        if (IS_ERR(server)) {
@@ -2415,7 +2489,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
                compare_super = NULL;
        /* Get a superblock - note that we may end up sharing one that already exists */
-        s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata);
+        s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
        if (IS_ERR(s)) {
                error = PTR_ERR(s);
                goto out_free;
@@ -2452,14 +2526,9 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
        error = 0;
 out:
-        kfree(data->client_address);
-        kfree(data->nfs_server.export_path);
-        kfree(data->nfs_server.hostname);
-        kfree(data->fscache_uniq);
        security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
        kfree(mntfh);
-        kfree(data);
        return error;
 out_free:
@@ -2473,16 +2542,137 @@ error_splat_super:
        goto out;
 }
+/*
+ * Kernel-mount "<hostname>:/" using the given filesystem type.
+ *
+ * Returns the result of vfs_kern_mount(), or ERR_PTR(-ENOMEM) when the
+ * temporary device name cannot be allocated.  The device name string is
+ * freed again before returning.
+ */
+static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type,
+                int flags, void *data, const char *hostname)
+{
+        /* sizeof(":/") accounts for both separator bytes and the NUL */
+        size_t devname_size = strlen(hostname) + sizeof(":/");
+        char *devname = kmalloc(devname_size, GFP_KERNEL);
+        struct vfsmount *mnt;
+        if (!devname)
+                return ERR_PTR(-ENOMEM);
+        snprintf(devname, devname_size, "%s:/", hostname);
+        mnt = vfs_kern_mount(fs_type, flags, devname, data);
+        kfree(devname);
+        return mnt;
+}
+/*
+ * Best-effort replacement of mnt->mnt_devname with the path string that
+ * nfs_path() builds for @path.  On any failure (no scratch page, nfs_path()
+ * error, no memory for the copy) mnt->mnt_devname is left as it was.
+ */
+static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt)
+{
+        char *page = (char *) __get_free_page(GFP_KERNEL);
+        char *devname, *tmp;
+        if (page == NULL)
+                return;
+        devname = nfs_path(path->mnt->mnt_devname,
+                        path->mnt->mnt_root, path->dentry,
+                        page, PAGE_SIZE);
+        /*
+         * NOTE(review): nfs_path() is understood to report failure with an
+         * ERR_PTR() value rather than NULL; reject both so that kstrdup()
+         * is never handed an error pointer.
+         */
+        if (devname == NULL || IS_ERR(devname))
+                goto out_freepage;
+        tmp = kstrdup(devname, GFP_KERNEL);
+        if (tmp == NULL)
+                goto out_freepage;
+        kfree(mnt->mnt_devname);
+        mnt->mnt_devname = tmp;
+out_freepage:
+        free_page((unsigned long)page);
+}
+/*
+ * Walk @export_path starting at the root of @root_mnt, inside a private
+ * mount namespace created from @root_mnt, and point @mnt_target at the
+ * superblock and dentry the walk ends on.
+ *
+ * Returns 0 with an extra s_active reference taken on that superblock and
+ * its s_umount held for writing, or the error from create_mnt_ns() /
+ * vfs_path_lookup().  If the namespace cannot be created, @root_mnt is
+ * dropped with mntput() here.
+ */
+static int nfs_follow_remote_path(struct vfsmount *root_mnt,
+                const char *export_path, struct vfsmount *mnt_target)
+{
+        struct mnt_namespace *tmp_ns;
+        struct super_block *sb;
+        struct nameidata nd;
+        int err;
+        tmp_ns = create_mnt_ns(root_mnt);
+        if (IS_ERR(tmp_ns)) {
+                mntput(root_mnt);
+                return PTR_ERR(tmp_ns);
+        }
+        err = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
+                        export_path, LOOKUP_FOLLOW, &nd);
+        /* The private namespace is only needed for the lookup itself */
+        put_mnt_ns(tmp_ns);
+        if (err)
+                return err;
+        sb = nd.path.mnt->mnt_sb;
+        atomic_inc(&sb->s_active);
+        mnt_target->mnt_sb = sb;
+        mnt_target->mnt_root = dget(nd.path.dentry);
+        /* Correct the device pathname */
+        nfs_fix_devname(&nd.path, mnt_target);
+        path_put(&nd.path);
+        down_write(&sb->s_umount);
+        return 0;
+}
+/*
+ * Get the superblock for an NFS4 mountpoint
+ *
+ * The mount data is validated first; the server root is then mounted via
+ * nfs_do_root_mount() with the export path temporarily set to "/", and the
+ * real export path is resolved with nfs_follow_remote_path().  Returns 0 or
+ * a negative error; the parsed mount data is freed on every path.
+ */
+static int nfs4_get_sb(struct file_system_type *fs_type,
+        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+{
+        struct nfs_parsed_mount_data *data;
+        struct vfsmount *root_mnt;
+        char *saved_path;
+        int error;
+        data = kzalloc(sizeof(*data), GFP_KERNEL);
+        if (!data) {
+                error = -ENOMEM;
+                goto out_free_data;
+        }
+        /* Validate the mount data */
+        error = nfs4_validate_mount_data(raw_data, data, dev_name);
+        if (error < 0)
+                goto out;
+        /* Mount the server root; put the real export path back afterwards */
+        saved_path = data->nfs_server.export_path;
+        data->nfs_server.export_path = "/";
+        root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
+                        data->nfs_server.hostname);
+        data->nfs_server.export_path = saved_path;
+        if (IS_ERR(root_mnt))
+                error = PTR_ERR(root_mnt);
+        else
+                error = nfs_follow_remote_path(root_mnt, saved_path, mnt);
+out:
+        kfree(data->client_address);
+        kfree(data->nfs_server.export_path);
+        kfree(data->nfs_server.hostname);
+        kfree(data->fscache_uniq);
+out_free_data:
+        kfree(data);
+        dprintk("<-- nfs4_get_sb() = %d%s\n", error,
+                        error != 0 ? " [error]" : "");
+        return error;
+}
 static void nfs4_kill_super(struct super_block *sb)
 {
        struct nfs_server *server = NFS_SB(sb);
+        dprintk("--> %s\n", __func__);
        nfs_super_return_all_delegations(sb);
        kill_anon_super(sb);
        nfs4_renewd_prepare_shutdown(server);
        nfs_fscache_release_super_cookie(sb);
        nfs_free_server(server);
+        dprintk("<-- %s\n", __func__);
 }
 /*
@@ -2568,12 +2758,9 @@ error_splat_super:
        return error;
 }
-/*
+static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
- * Create an NFS4 server record on referral traversal
+                int flags, const char *dev_name, void *raw_data,
- */
+                struct vfsmount *mnt)
-static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
-                                const char *dev_name, void *raw_data,
-                                struct vfsmount *mnt)
 {
        struct nfs_clone_mount *data = raw_data;
        struct super_block *s;
@@ -2652,4 +2839,36 @@ error_splat_super:
        return error;
 }
+/*
+ * Create an NFS4 server record on referral traversal
+ *
+ * The server root is mounted via nfs_do_root_mount() with mnt_path
+ * temporarily set to "/"; the original mnt_path is then resolved with
+ * nfs_follow_remote_path().  Returns 0 or a negative error.
+ */
+static int nfs4_referral_get_sb(struct file_system_type *fs_type,
+                int flags, const char *dev_name, void *raw_data,
+                struct vfsmount *mnt)
+{
+        struct nfs_clone_mount *clone = raw_data;
+        struct vfsmount *root_mnt;
+        char *saved_path;
+        int error;
+        dprintk("--> nfs4_referral_get_sb()\n");
+        /* Mount the server root; put the real path back afterwards */
+        saved_path = clone->mnt_path;
+        clone->mnt_path = "/";
+        root_mnt = nfs_do_root_mount(&nfs4_remote_referral_fs_type,
+                        flags, clone, clone->hostname);
+        clone->mnt_path = saved_path;
+        if (IS_ERR(root_mnt))
+                error = PTR_ERR(root_mnt);
+        else
+                error = nfs_follow_remote_path(root_mnt, saved_path, mnt);
+        dprintk("<-- nfs4_referral_get_sb() = %d%s\n", error,
+                        error != 0 ? " [error]" : "");
+        return error;
+}
 #endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index ecc295347775..1064c91ae810 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -15,6 +15,7 @@
 #include <linux/wait.h>
 #include "internal.h"
+#include "nfs4_fs.h"
 struct nfs_unlinkdata {
        struct hlist_node list;
@@ -82,7 +83,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
        struct inode *dir = data->dir;
        if (!NFS_PROTO(dir)->unlink_done(task, dir))
-                rpc_restart_call(task);
+                nfs4_restart_rpc(task, NFS_SERVER(dir)->nfs_client);
 }
 /**
@@ -102,9 +103,25 @@ static void nfs_async_unlink_release(void *calldata)
        nfs_sb_deactive(sb);
 }
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Prepare step for an asynchronous unlink RPC: the call is started only
+ * when nfs4_setup_sequence() returns zero; otherwise nothing further is
+ * done here.
+ */
+void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
+{
+        struct nfs_unlinkdata *unlink = calldata;
+        struct nfs_server *server = NFS_SERVER(unlink->dir);
+        if (nfs4_setup_sequence(server->nfs_client, &unlink->args.seq_args,
+                                &unlink->res.seq_res, 1, task) == 0)
+                rpc_call_start(task);
+}
+#endif /* CONFIG_NFS_V4_1 */
 static const struct rpc_call_ops nfs_unlink_ops = {
        .rpc_call_done = nfs_async_unlink_done,
        .rpc_release = nfs_async_unlink_release,
+#if defined(CONFIG_NFS_V4_1)
+        .rpc_call_prepare = nfs_unlink_prepare,
+#endif /* CONFIG_NFS_V4_1 */
 };
 static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data)
@@ -241,6 +258,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
                status = PTR_ERR(data->cred);
                goto out_free;
        }
+        data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        status = -EBUSY;
        spin_lock(&dentry->d_lock);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e560a78995a3..a34fae21fe10 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -25,6 +25,7 @@
 #include "delegation.h"
 #include "internal.h"
 #include "iostat.h"
+#include "nfs4_fs.h"
 #define NFSDBG_FACILITY         NFSDBG_PAGECACHE
@@ -52,6 +53,7 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
        if (p) {
                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->pages);
+                p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        }
        return p;
 }
@@ -71,6 +73,7 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->pages);
                p->npages = pagecount;
+                p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
                if (pagecount <= ARRAY_SIZE(p->page_array))
                        p->pagevec = p->page_array;
                else {
@@ -84,17 +87,15 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
        return p;
 }
-static void nfs_writedata_free(struct nfs_write_data *p)
+void nfs_writedata_free(struct nfs_write_data *p)
 {
        if (p && (p->pagevec != &p->page_array[0]))
                kfree(p->pagevec);
        mempool_free(p, nfs_wdata_mempool);
 }
-void nfs_writedata_release(void *data)
+static void nfs_writedata_release(struct nfs_write_data *wdata)
 {
-        struct nfs_write_data *wdata = data;
        put_nfs_open_context(wdata->args.context);
        nfs_writedata_free(wdata);
 }
@@ -199,8 +200,10 @@ static int nfs_set_page_writeback(struct page *page)
                struct nfs_server *nfss = NFS_SERVER(inode);
                if (atomic_long_inc_return(&nfss->writeback) >
-                                NFS_CONGESTION_ON_THRESH)
+                                NFS_CONGESTION_ON_THRESH) {
-                        set_bdi_congested(&nfss->backing_dev_info, WRITE);
+                        set_bdi_congested(&nfss->backing_dev_info,
+                                                BLK_RW_ASYNC);
+                }
        }
        return ret;
 }
@@ -212,7 +215,7 @@ static void nfs_end_page_writeback(struct page *page)
        end_page_writeback(page);
        if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
-                clear_bdi_congested(&nfss->backing_dev_info, WRITE);
+                clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
 }
 /*
@@ -1048,7 +1051,23 @@ out:
        nfs_writedata_release(calldata);
 }
+#if defined(CONFIG_NFS_V4_1)
+void nfs_write_prepare(struct rpc_task *task, void *calldata)
+{
+        struct nfs_write_data *data = calldata;
+        struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client;
+        if (nfs4_setup_sequence(clp, &data->args.seq_args,
+                                &data->res.seq_res, 1, task))
+                return;
+        rpc_call_start(task);
+}
+#endif /* CONFIG_NFS_V4_1 */
 static const struct rpc_call_ops nfs_write_partial_ops = {
+#if defined(CONFIG_NFS_V4_1)
+        .rpc_call_prepare = nfs_write_prepare,
+#endif /* CONFIG_NFS_V4_1 */
        .rpc_call_done = nfs_writeback_done_partial,
        .rpc_release = nfs_writeback_release_partial,
 };
@@ -1111,6 +1130,9 @@ remove_request:
 }
 static const struct rpc_call_ops nfs_write_full_ops = {
+#if defined(CONFIG_NFS_V4_1)
+        .rpc_call_prepare = nfs_write_prepare,
+#endif /* CONFIG_NFS_V4_1 */
        .rpc_call_done = nfs_writeback_done_full,
        .rpc_release = nfs_writeback_release_full,
 };
@@ -1123,6 +1145,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 {
        struct nfs_writeargs    *argp = &data->args;
        struct nfs_writeres     *resp = &data->res;
+        struct nfs_server       *server = NFS_SERVER(data->inode);
        int status;
        dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
@@ -1155,7 +1178,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
                if (time_before(complain, jiffies)) {
                        dprintk("NFS:       faulty NFS server %s:"
                                " (committed = %d) != (stable = %d)\n",
-                                NFS_SERVER(data->inode)->nfs_client->cl_hostname,
+                                server->nfs_client->cl_hostname,
                                resp->verf->committed, argp->stable);
                        complain = jiffies + 300 * HZ;
                }
@@ -1181,7 +1204,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
                                 */
                                argp->stable = NFS_FILE_SYNC;
                        }
-                        rpc_restart_call(task);
+                        nfs4_restart_rpc(task, server->nfs_client);
                        return -EAGAIN;
                }
                if (time_before(complain, jiffies)) {
@@ -1193,6 +1216,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
                /* Can't do anything about it except throw an error. */
                task->tk_status = -EIO;
        }
+        nfs4_sequence_free_slot(server->nfs_client, &data->res.seq_res);
        return 0;
 }
@@ -1349,6 +1373,9 @@ static void nfs_commit_release(void *calldata)
 }
 static const struct rpc_call_ops nfs_commit_ops = {
+#if defined(CONFIG_NFS_V4_1)
+        .rpc_call_prepare = nfs_write_prepare,
+#endif /* CONFIG_NFS_V4_1 */
        .rpc_call_done = nfs_commit_done,
        .rpc_release = nfs_commit_release,
 };
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 8b1f8efb4690..b92a27629fb7 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -464,16 +464,11 @@ static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp)
                if (err)
                        return err;
                /*
-                 * Just a quick sanity check; we could also try to check
+                 * XXX: It would be nice to also check whether this
-                 * whether this pseudoflavor is supported, but at worst
+                 * pseudoflavor is supported, so we can discover the
-                 * an unsupported pseudoflavor on the export would just
+                 * problem at export time instead of when a client fails
-                 * be a pseudoflavor that won't match the flavor of any
+                 * to authenticate.
-                 * authenticated request.  The administrator will
-                 * probably discover the problem when someone fails to
-                 * authenticate.
                 */
-                if (f->pseudoflavor < 0)
-                        return -EINVAL;
                err = get_int(mesg, &f->flags);
                if (err)
                        return err;
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 7c9fe838f038..a713c418a922 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -652,8 +652,6 @@ nfsd3_proc_commit(struct svc_rqst * rqstp, struct nfsd3_commitargs *argp,
 * NFSv3 Server procedures.
 * Only the results of non-idempotent operations are cached.
 */
-#define nfs3svc_decode_voidargs         NULL
-#define nfs3svc_release_void            NULL
 #define nfs3svc_decode_fhandleargs      nfs3svc_decode_fhandle
 #define nfs3svc_encode_attrstatres      nfs3svc_encode_attrstat
 #define nfs3svc_encode_wccstatres       nfs3svc_encode_wccstat
@@ -686,28 +684,219 @@ struct nfsd3_voidargs { int dummy; };
 #define WC (7+pAT)      /* WCC attributes */
 static struct svc_procedure             nfsd_procedures3[22] = {
-  PROC(null,     void,          void,           void,     RC_NOCACHE, ST),
+        [NFS3PROC_NULL] = {
-  PROC(getattr,  fhandle,       attrstat,       fhandle,  RC_NOCACHE, ST+AT),
+                .pc_func = (svc_procfunc) nfsd3_proc_null,
-  PROC(setattr,  sattr,         wccstat,        fhandle,  RC_REPLBUFF, ST+WC),
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_voidres,
-  PROC(lookup,   dirop,         dirop,          fhandle2, RC_NOCACHE, ST+FH+pAT+pAT),
+                .pc_argsize = sizeof(struct nfsd3_voidargs),
-  PROC(access,   access,        access,         fhandle,  RC_NOCACHE, ST+pAT+1),
+                .pc_ressize = sizeof(struct nfsd3_voidres),
-  PROC(readlink, readlink,      readlink,       fhandle,  RC_NOCACHE, ST+pAT+1+NFS3_MAXPATHLEN/4),
+                .pc_cachetype = RC_NOCACHE,
-  PROC(read,     read,          read,           fhandle,  RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE/4),
+                .pc_xdrressize = ST,
-  PROC(write,    write,         write,          fhandle,  RC_REPLBUFF, ST+WC+4),
+        },
-  PROC(create,   create,        create,         fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC),
+        [NFS3PROC_GETATTR] = {
-  PROC(mkdir,    mkdir,         create,         fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC),
+                .pc_func = (svc_procfunc) nfsd3_proc_getattr,
-  PROC(symlink,  symlink,       create,         fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC),
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs,
-  PROC(mknod,    mknod,         create,         fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC),
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_attrstatres,
-  PROC(remove,   dirop,         wccstat,        fhandle,  RC_REPLBUFF, ST+WC),
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
-  PROC(rmdir,    dirop,         wccstat,        fhandle,  RC_REPLBUFF, ST+WC),
+                .pc_argsize = sizeof(struct nfsd3_fhandleargs),
-  PROC(rename,   rename,        rename,         fhandle2, RC_REPLBUFF, ST+WC+WC),
+                .pc_ressize = sizeof(struct nfsd3_attrstatres),
-  PROC(link,     link,          link,           fhandle2, RC_REPLBUFF, ST+pAT+WC),
+                .pc_cachetype = RC_NOCACHE,
-  PROC(readdir,  readdir,       readdir,        fhandle,  RC_NOCACHE, 0),
+                .pc_xdrressize = ST+AT,
-  PROC(readdirplus,readdirplus, readdir,        fhandle,  RC_NOCACHE, 0),
+        },
-  PROC(fsstat,   fhandle,       fsstat,         void,     RC_NOCACHE, ST+pAT+2*6+1),
+        [NFS3PROC_SETATTR] = {
-  PROC(fsinfo,   fhandle,       fsinfo,         void,     RC_NOCACHE, ST+pAT+12),
+                .pc_func = (svc_procfunc) nfsd3_proc_setattr,
-  PROC(pathconf, fhandle,       pathconf,       void,     RC_NOCACHE, ST+pAT+6),
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_sattrargs,
-  PROC(commit,   commit,        commit,         fhandle,  RC_NOCACHE, ST+WC+2),
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres,
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+                .pc_argsize = sizeof(struct nfsd3_sattrargs),
+                .pc_ressize = sizeof(struct nfsd3_wccstatres),
+                .pc_cachetype = RC_REPLBUFF,
+                .pc_xdrressize = ST+WC,
+        },
+        [NFS3PROC_LOOKUP] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_lookup,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_diropres,
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
+                .pc_argsize = sizeof(struct nfsd3_diropargs),
+                .pc_ressize = sizeof(struct nfsd3_diropres),
+                .pc_cachetype = RC_NOCACHE,
+                .pc_xdrressize = ST+FH+pAT+pAT,
+        },
+        [NFS3PROC_ACCESS] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_access,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_accessargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_accessres,
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+                .pc_argsize = sizeof(struct nfsd3_accessargs),
+                .pc_ressize = sizeof(struct nfsd3_accessres),
+                .pc_cachetype = RC_NOCACHE,
+                .pc_xdrressize = ST+pAT+1,
+        },
+        [NFS3PROC_READLINK] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_readlink,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_readlinkargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_readlinkres,
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+                .pc_argsize = sizeof(struct nfsd3_readlinkargs),
+                .pc_ressize = sizeof(struct nfsd3_readlinkres),
+                .pc_cachetype = RC_NOCACHE,
+                .pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4,
+        },
+        [NFS3PROC_READ] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_read,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_readargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_readres,
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+                .pc_argsize = sizeof(struct nfsd3_readargs),
+                .pc_ressize = sizeof(struct nfsd3_readres),
+                .pc_cachetype = RC_NOCACHE,
+                .pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4,
+        },
+        [NFS3PROC_WRITE] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_write,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_writeargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_writeres,
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+                .pc_argsize = sizeof(struct nfsd3_writeargs),
+                .pc_ressize = sizeof(struct nfsd3_writeres),
+                .pc_cachetype = RC_REPLBUFF,
+                .pc_xdrressize = ST+WC+4,
+        },
+        [NFS3PROC_CREATE] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_create,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_createargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_createres,
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
+                .pc_argsize = sizeof(struct nfsd3_createargs),
+                .pc_ressize = sizeof(struct nfsd3_createres),
+                .pc_cachetype = RC_REPLBUFF,
+                .pc_xdrressize = ST+(1+FH+pAT)+WC,
+        },
+        [NFS3PROC_MKDIR] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_mkdir,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_mkdirargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_createres,
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
+                .pc_argsize = sizeof(struct nfsd3_mkdirargs),
+                .pc_ressize = sizeof(struct nfsd3_createres),
+                .pc_cachetype = RC_REPLBUFF,
+                .pc_xdrressize = ST+(1+FH+pAT)+WC,
+        },
+        [NFS3PROC_SYMLINK] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_symlink,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_symlinkargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_createres,
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
+                .pc_argsize = sizeof(struct nfsd3_symlinkargs),
+                .pc_ressize = sizeof(struct nfsd3_createres),
+                .pc_cachetype = RC_REPLBUFF,
+                .pc_xdrressize = ST+(1+FH+pAT)+WC,
+        },
+        [NFS3PROC_MKNOD] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_mknod,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_mknodargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_createres,
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
+                .pc_argsize = sizeof(struct nfsd3_mknodargs),
+                .pc_ressize = sizeof(struct nfsd3_createres),
+                .pc_cachetype = RC_REPLBUFF,
+                .pc_xdrressize = ST+(1+FH+pAT)+WC,
+        },
+        [NFS3PROC_REMOVE] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_remove,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres,
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+                .pc_argsize = sizeof(struct nfsd3_diropargs),
+                .pc_ressize = sizeof(struct nfsd3_wccstatres),
+                .pc_cachetype = RC_REPLBUFF,
+                .pc_xdrressize = ST+WC,
+        },
+        [NFS3PROC_RMDIR] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_rmdir,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres,
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+                .pc_argsize = sizeof(struct nfsd3_diropargs),
+                .pc_ressize = sizeof(struct nfsd3_wccstatres),
+                .pc_cachetype = RC_REPLBUFF,
+                .pc_xdrressize = ST+WC,
+        },
+        [NFS3PROC_RENAME] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_rename,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_renameargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_renameres,
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
+                .pc_argsize = sizeof(struct nfsd3_renameargs),
+                .pc_ressize = sizeof(struct nfsd3_renameres),
+                .pc_cachetype = RC_REPLBUFF,
+                .pc_xdrressize = ST+WC+WC,
+        },
+        [NFS3PROC_LINK] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_link,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_linkargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_linkres,
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
+                .pc_argsize = sizeof(struct nfsd3_linkargs),
+                .pc_ressize = sizeof(struct nfsd3_linkres),
+                .pc_cachetype = RC_REPLBUFF,
+                .pc_xdrressize = ST+pAT+WC,
+        },
+        [NFS3PROC_READDIR] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_readdir,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_readdirargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_readdirres,
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+                .pc_argsize = sizeof(struct nfsd3_readdirargs),
+                .pc_ressize = sizeof(struct nfsd3_readdirres),
+                .pc_cachetype = RC_NOCACHE,
+        },
+        [NFS3PROC_READDIRPLUS] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_readdirplus,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_readdirplusargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_readdirres,
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+                .pc_argsize = sizeof(struct nfsd3_readdirplusargs),
+                .pc_ressize = sizeof(struct nfsd3_readdirres),
+                .pc_cachetype = RC_NOCACHE,
+        },
+        [NFS3PROC_FSSTAT] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_fsstat,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_fsstatres,
+                .pc_argsize = sizeof(struct nfsd3_fhandleargs),
+                .pc_ressize = sizeof(struct nfsd3_fsstatres),
+                .pc_cachetype = RC_NOCACHE,
+                .pc_xdrressize = ST+pAT+2*6+1,
+        },
+        [NFS3PROC_FSINFO] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_fsinfo,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_fsinfores,
+                .pc_argsize = sizeof(struct nfsd3_fhandleargs),
+                .pc_ressize = sizeof(struct nfsd3_fsinfores),
+                .pc_cachetype = RC_NOCACHE,
+                .pc_xdrressize = ST+pAT+12,
+        },
+        [NFS3PROC_PATHCONF] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_pathconf,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_pathconfres,
+                .pc_argsize = sizeof(struct nfsd3_fhandleargs),
+                .pc_ressize = sizeof(struct nfsd3_pathconfres),
+                .pc_cachetype = RC_NOCACHE,
+                .pc_xdrressize = ST+pAT+6,
+        },
+        [NFS3PROC_COMMIT] = {
+                .pc_func = (svc_procfunc) nfsd3_proc_commit,
+                .pc_decode = (kxdrproc_t) nfs3svc_decode_commitargs,
+                .pc_encode = (kxdrproc_t) nfs3svc_encode_commitres,
+                .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
+                .pc_argsize = sizeof(struct nfsd3_commitargs),
+                .pc_ressize = sizeof(struct nfsd3_commitres),
+                .pc_cachetype = RC_NOCACHE,
+                .pc_xdrressize = ST+WC+2,
+        },
 };
 struct svc_version      nfsd_version3 = {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 17d0dd997204..01d4ec1c88e0 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -272,6 +272,7 @@ void fill_post_wcc(struct svc_fh *fhp)
        err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry,
                        &fhp->fh_post_attr);
+        fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;
        if (err)
                fhp->fh_post_saved = 0;
        else
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 290289bd44f7..3fd23f7aceca 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -140,8 +140,10 @@ struct nfs4_cb_compound_hdr {
        int             status;
        u32             ident;
        u32             nops;
+        __be32          *nops_p;
+        u32             minorversion;
        u32             taglen;
-        char *          tag;
+        char            *tag;
 };
 static struct {
@@ -201,33 +203,39 @@ nfs_cb_stat_to_errno(int stat)
 * XDR encode
 */
-static int
+static void
 encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
 {
        __be32 * p;
        RESERVE_SPACE(16);
        WRITE32(0);            /* tag length is always 0 */
-        WRITE32(NFS4_MINOR_VERSION);
+        WRITE32(hdr->minorversion);
        WRITE32(hdr->ident);
+        hdr->nops_p = p;
        WRITE32(hdr->nops);
-        return 0;
 }
-static int
+static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr)
-encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
+{
+        *hdr->nops_p = htonl(hdr->nops);
+}
+static void
+encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
+                struct nfs4_cb_compound_hdr *hdr)
 {
        __be32 *p;
-        int len = cb_rec->cbr_fh.fh_size;
+        int len = dp->dl_fh.fh_size;
-        RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
+        RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len);
        WRITE32(OP_CB_RECALL);
-        WRITE32(cb_rec->cbr_stateid.si_generation);
+        WRITE32(dp->dl_stateid.si_generation);
-        WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
+        WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t));
-        WRITE32(cb_rec->cbr_trunc);
+        WRITE32(0); /* truncate optimization not implemented */
        WRITE32(len);
-        WRITEMEM(&cb_rec->cbr_fh.fh_base, len);
+        WRITEMEM(&dp->dl_fh.fh_base, len);
-        return 0;
+        hdr->nops++;
 }
 static int
@@ -241,17 +249,18 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
 }
 static int
-nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_cb_recall *args)
+nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_delegation *args)
 {
        struct xdr_stream xdr;
        struct nfs4_cb_compound_hdr hdr = {
-                .ident = args->cbr_ident,
+                .ident = args->dl_ident,
-                .nops   = 1,
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
        encode_cb_compound_hdr(&xdr, &hdr);
-        return (encode_cb_recall(&xdr, args));
+        encode_cb_recall(&xdr, args, &hdr);
+        encode_cb_nops(&hdr);
+        return 0;
 }
@@ -358,18 +367,21 @@ static struct rpc_program cb_program = {
                .pipe_dir_name  = "/nfsd4_cb",
 };
+static int max_cb_time(void)
+{
+        return max(NFSD_LEASE_TIME/10, (time_t)1) * HZ;
+}
 /* Reference counting, callback cleanup, etc., all look racy as heck.
 * And why is cb_set an atomic? */
-static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
+int setup_callback_client(struct nfs4_client *clp)
 {
        struct sockaddr_in      addr;
-        struct nfs4_callback    *cb = &clp->cl_callback;
+        struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
        struct rpc_timeout      timeparms = {
-                .to_initval     = (NFSD_LEASE_TIME/4) * HZ,
+                .to_initval     = max_cb_time(),
-                .to_retries     = 5,
+                .to_retries     = 0,
-                .to_maxval      = (NFSD_LEASE_TIME/2) * HZ,
-                .to_exponential = 1,
        };
        struct rpc_create_args args = {
                .protocol       = IPPROTO_TCP,
@@ -386,7 +398,7 @@ static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
        struct rpc_clnt *client;
        if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
-                return ERR_PTR(-EINVAL);
+                return -EINVAL;
        /* Initialize address */
        memset(&addr, 0, sizeof(addr));
@@ -396,48 +408,77 @@ static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
        /* Create RPC client */
        client = rpc_create(&args);
-        if (IS_ERR(client))
+        if (IS_ERR(client)) {
                dprintk("NFSD: couldn't create callback client: %ld\n",
                        PTR_ERR(client));
-        return client;
+                return PTR_ERR(client);
+        }
+        cb->cb_client = client;
+        return 0;
+}
+static void warn_no_callback_path(struct nfs4_client *clp, int reason)
+{
+        dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
+                (int)clp->cl_name.len, clp->cl_name.data, reason);
+}
+static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
+{
+        struct nfs4_client *clp = calldata;
+        if (task->tk_status)
+                warn_no_callback_path(clp, task->tk_status);
+        else
+                atomic_set(&clp->cl_cb_conn.cb_set, 1);
+        put_nfs4_client(clp);
+}
+static const struct rpc_call_ops nfsd4_cb_probe_ops = {
+        .rpc_call_done = nfsd4_cb_probe_done,
+};
+static struct rpc_cred *lookup_cb_cred(struct nfs4_cb_conn *cb)
+{
+        struct auth_cred acred = {
+                .machine_cred = 1
+        };
+        /*
+         * Note in the gss case this doesn't actually have to wait for a
+         * gss upcall (or any calls to the client); this just creates a
+         * non-uptodate cred which the rpc state machine will fill in with
+         * a refresh_upcall later.
+         */
+        return rpcauth_lookup_credcache(cb->cb_client->cl_auth, &acred,
+                                                        RPCAUTH_LOOKUP_NEW);
 }
-static int do_probe_callback(void *data)
+void do_probe_callback(struct nfs4_client *clp)
 {
-        struct nfs4_client *clp = data;
+        struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
-        struct nfs4_callback    *cb = &clp->cl_callback;
        struct rpc_message msg = {
                .rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
                .rpc_argp       = clp,
        };
-        struct rpc_clnt *client;
+        struct rpc_cred *cred;
        int status;
-        client = setup_callback_client(clp);
+        cred = lookup_cb_cred(cb);
-        if (IS_ERR(client)) {
+        if (IS_ERR(cred)) {
-                status = PTR_ERR(client);
+                status = PTR_ERR(cred);
-                dprintk("NFSD: couldn't create callback client: %d\n",
+                goto out;
-                                                                status);
+        }
-                goto out_err;
+        cb->cb_cred = cred;
+        msg.rpc_cred = cb->cb_cred;
+        status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT,
+                                &nfsd4_cb_probe_ops, (void *)clp);
+out:
+        if (status) {
+                warn_no_callback_path(clp, status);
+                put_nfs4_client(clp);
        }
-        status = rpc_call_sync(client, &msg, RPC_TASK_SOFT);
-        if (status)
-                goto out_release_client;
-        cb->cb_client = client;
-        atomic_set(&cb->cb_set, 1);
-        put_nfs4_client(clp);
-        return 0;
-out_release_client:
-        rpc_shutdown_client(client);
-out_err:
-        dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
-                (int)clp->cl_name.len, clp->cl_name.data, status);
-        put_nfs4_client(clp);
-        return 0;
 }
 /*
@@ -446,21 +487,65 @@ out_err:
 void
 nfsd4_probe_callback(struct nfs4_client *clp)
 {
-        struct task_struct *t;
+        int status;
-        BUG_ON(atomic_read(&clp->cl_callback.cb_set));
+        BUG_ON(atomic_read(&clp->cl_cb_conn.cb_set));
+        status = setup_callback_client(clp);
+        if (status) {
+                warn_no_callback_path(clp, status);
+                return;
+        }
        /* the task holds a reference to the nfs4_client struct */
        atomic_inc(&clp->cl_count);
-        t = kthread_run(do_probe_callback, clp, "nfs4_cb_probe");
+        do_probe_callback(clp);
+}
-        if (IS_ERR(t))
+static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
-                atomic_dec(&clp->cl_count);
+{
+        struct nfs4_delegation *dp = calldata;
+        struct nfs4_client *clp = dp->dl_client;
-        return;
+        switch (task->tk_status) {
+        case -EIO:
+                /* Network partition? */
+                atomic_set(&clp->cl_cb_conn.cb_set, 0);
+                warn_no_callback_path(clp, task->tk_status);
+        case -EBADHANDLE:
+        case -NFS4ERR_BAD_STATEID:
+                /* Race: client probably got cb_recall
+                 * before open reply granting delegation */
+                break;
+        default:
+                /* success, or error we can't handle */
+                return;
+        }
+        if (dp->dl_retries--) {
+                rpc_delay(task, 2*HZ);
+                task->tk_status = 0;
+                rpc_restart_call(task);
+        } else {
+                atomic_set(&clp->cl_cb_conn.cb_set, 0);
+                warn_no_callback_path(clp, task->tk_status);
+        }
+}
+static void nfsd4_cb_recall_release(void *calldata)
+{
+        struct nfs4_delegation *dp = calldata;
+        struct nfs4_client *clp = dp->dl_client;
+        nfs4_put_delegation(dp);
+        put_nfs4_client(clp);
 }
+static const struct rpc_call_ops nfsd4_cb_recall_ops = {
+        .rpc_call_done = nfsd4_cb_recall_done,
+        .rpc_release = nfsd4_cb_recall_release,
+};
 /*
 * called with dp->dl_count inc'ed.
 */
@@ -468,41 +553,19 @@ void
 nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
        struct nfs4_client *clp = dp->dl_client;
-        struct rpc_clnt *clnt = clp->cl_callback.cb_client;
+        struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
-        struct nfs4_cb_recall *cbr = &dp->dl_recall;
        struct rpc_message msg = {
                .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
-                .rpc_argp = cbr,
+                .rpc_argp = dp,
+                .rpc_cred = clp->cl_cb_conn.cb_cred
        };
-        int retries = 1;
+        int status;
-        int status = 0;
+        dp->dl_retries = 1;
-        cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */
+        status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
-        cbr->cbr_dp = dp;
+                                &nfsd4_cb_recall_ops, dp);
+        if (status) {
-        status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT);
+                put_nfs4_client(clp);
-        while (retries--) {
+                nfs4_put_delegation(dp);
-                switch (status) {
-                        case -EIO:
-                                /* Network partition? */
-                                atomic_set(&clp->cl_callback.cb_set, 0);
-                        case -EBADHANDLE:
-                        case -NFS4ERR_BAD_STATEID:
-                                /* Race: client probably got cb_recall
-                                 * before open reply granting delegation */
-                                break;
-                        default:
-                                goto out_put_cred;
-                }
-                ssleep(2);
-                status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT);
        }
-out_put_cred:
-        /*
-         * Success or failure, now we're either waiting for lease expiration
-         * or deleg_return.
-         */
-        put_nfs4_client(clp);
-        nfs4_put_delegation(dp);
-        return;
 }
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index b2883e9c6381..7c8801769a3c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -51,6 +51,78 @@
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
+static u32 nfsd_attrmask[] = { /* "writable" mask handed to check_attr_support() */
+        NFSD_WRITEABLE_ATTRS_WORD0,
+        NFSD_WRITEABLE_ATTRS_WORD1,
+        NFSD_WRITEABLE_ATTRS_WORD2
+};
+static u32 nfsd41_ex_attrmask[] = { /* "writable" mask for NFS4_CREATE_EXCLUSIVE4_1 opens */
+        NFSD_SUPPATTR_EXCLCREAT_WORD0,
+        NFSD_SUPPATTR_EXCLCREAT_WORD1,
+        NFSD_SUPPATTR_EXCLCREAT_WORD2
+};
+static __be32
+check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+                   u32 *bmval, u32 *writable)
+{
+        struct dentry *dentry = cstate->current_fh.fh_dentry;
+        struct svc_export *exp = cstate->current_fh.fh_export;
+        /*
+         * Check whether every requested attribute bit is supported by the
+         * NFSv4 server for this compound's minor version.
+         * According to spec, unsupported attributes return ERR_ATTRNOTSUPP.
+         */
+        if ((bmval[0] & ~nfsd_suppattrs0(cstate->minorversion)) ||
+            (bmval[1] & ~nfsd_suppattrs1(cstate->minorversion)) ||
+            (bmval[2] & ~nfsd_suppattrs2(cstate->minorversion)))
+                return nfserr_attrnotsupp;
+        /*
+         * Check whether FATTR4_WORD0_ACL and FATTR4_WORD0_FS_LOCATIONS can
+         * be supported in the current environment: ACL needs IS_POSIXACL()
+         * to hold for the current filehandle's inode, and FS_LOCATIONS
+         * needs a non-NULL ex_fslocs.locations on the current filehandle's
+         * export.  Either failing also yields nfserr_attrnotsupp.
+         */
+        if (bmval[0] & FATTR4_WORD0_ACL) {
+                if (!IS_POSIXACL(dentry->d_inode))
+                        return nfserr_attrnotsupp;
+        }
+        if (bmval[0] & FATTR4_WORD0_FS_LOCATIONS) {
+                if (exp->ex_fslocs.locations == NULL)
+                        return nfserr_attrnotsupp;
+        }
+        /*
+         * According to spec, read-only attributes return ERR_INVAL.
+         * "writable" is a three-word mask of settable attributes; any
+         * requested bit outside it is rejected.  A NULL mask skips this
+         * check entirely.
+         */
+        if (writable) {
+                if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
+                    (bmval[2] & ~writable[2]))
+                        return nfserr_inval;
+        }
+        return nfs_ok;
+}
+static __be32
+nfsd4_check_open_attributes(struct svc_rqst *rqstp,
+        struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
+{
+        __be32 status = nfs_ok; /* returned unchanged when no branch below applies */
+        if (open->op_create == NFS4_OPEN_CREATE) { /* only OPEN-with-create is checked */
+                if (open->op_createmode == NFS4_CREATE_UNCHECKED
+                    || open->op_createmode == NFS4_CREATE_GUARDED)
+                        status = check_attr_support(rqstp, cstate,
+                                        open->op_bmval, nfsd_attrmask);
+                else if (open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1)
+                        status = check_attr_support(rqstp, cstate,
+                                        open->op_bmval, nfsd41_ex_attrmask);
+        }
+        return status;
+}
 static inline void
 fh_dup2(struct svc_fh *dst, struct svc_fh *src)
 {
@@ -225,6 +297,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        if (status)
                goto out;
+        status = nfsd4_check_open_attributes(rqstp, cstate, open);
+        if (status)
+                goto out;
        /* Openowner is now set, so sequence id will get bumped.  Now we need
         * these checks before we do any creates: */
        status = nfserr_grace;
@@ -395,6 +471,11 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        if (status)
                return status;
+        status = check_attr_support(rqstp, cstate, create->cr_bmval,
+                                    nfsd_attrmask);
+        if (status)
+                return status;
        switch (create->cr_type) {
        case NF4LNK:
                /* ugh! we have to null-terminate the linktext, or
@@ -689,6 +770,12 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        if (status)
                return status;
        status = nfs_ok;
+        status = check_attr_support(rqstp, cstate, setattr->sa_bmval,
+                                    nfsd_attrmask);
+        if (status)
+                goto out;
        if (setattr->sa_acl != NULL)
                status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh,
                                            setattr->sa_acl);
@@ -763,10 +850,10 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        if (status)
                return status;
-        if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion))
+        status = check_attr_support(rqstp, cstate, verify->ve_bmval, NULL);
-            || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion))
+        if (status)
-            || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion)))
+                return status;
-                return nfserr_attrnotsupp;
        if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)
            || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1))
                return nfserr_inval;
@@ -1226,24 +1313,9 @@ static const char *nfsd4_op_name(unsigned opnum)
        return "unknown_operation";
 }
-#define nfs4svc_decode_voidargs         NULL
-#define nfs4svc_release_void            NULL
 #define nfsd4_voidres                   nfsd4_voidargs
-#define nfs4svc_release_compound        NULL
 struct nfsd4_voidargs { int dummy; };
-#define PROC(name, argt, rest, relt, cache, respsize)   \
- { (svc_procfunc) nfsd4_proc_##name,            \
-   (kxdrproc_t) nfs4svc_decode_##argt##args,    \
-   (kxdrproc_t) nfs4svc_encode_##rest##res,     \
-   (kxdrproc_t) nfs4svc_release_##relt,         \
-   sizeof(struct nfsd4_##argt##args),           \
-   sizeof(struct nfsd4_##rest##res),            \
-   0,                                           \
-   cache,                                       \
-   respsize,                                    \
- }
 /*
 * TODO: At the present time, the NFSv4 server does not do XID caching
 * of requests.  Implementing XID caching would not be a serious problem,
@@ -1255,8 +1327,23 @@ struct nfsd4_voidargs { int dummy; };
 * better XID's.
 */
 static struct svc_procedure             nfsd_procedures4[2] = {
-  PROC(null,     void,          void,           void,     RC_NOCACHE, 1),
+        [NFSPROC4_NULL] = { /* .pc_decode left unset (was nfs4svc_decode_voidargs, i.e. NULL) */
-  PROC(compound, compound,      compound,       compound, RC_NOCACHE, NFSD_BUFSIZE/4)
+                .pc_func = (svc_procfunc) nfsd4_proc_null,
+                .pc_encode = (kxdrproc_t) nfs4svc_encode_voidres,
+                .pc_argsize = sizeof(struct nfsd4_voidargs),
+                .pc_ressize = sizeof(struct nfsd4_voidres),
+                .pc_cachetype = RC_NOCACHE,
+                .pc_xdrressize = 1,
+        },
+        [NFSPROC4_COMPOUND] = { /* no release hook set; the old nfs4svc_release_compound was NULL */
+                .pc_func = (svc_procfunc) nfsd4_proc_compound,
+                .pc_decode = (kxdrproc_t) nfs4svc_decode_compoundargs,
+                .pc_encode = (kxdrproc_t) nfs4svc_encode_compoundres,
+                .pc_argsize = sizeof(struct nfsd4_compoundargs),
+                .pc_ressize = sizeof(struct nfsd4_compoundres),
+                .pc_cachetype = RC_NOCACHE,
+                .pc_xdrressize = NFSD_BUFSIZE/4,
+        },
 };
 struct svc_version      nfsd_version4 = {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3b711f5147a7..980a216a48c8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -182,7 +182,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 {
        struct nfs4_delegation *dp;
        struct nfs4_file *fp = stp->st_file;
-        struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback;
+        struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn;
        dprintk("NFSD alloc_init_deleg\n");
        if (fp->fi_had_conflict)
@@ -203,10 +203,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
        get_file(stp->st_vfs_file);
        dp->dl_vfs_file = stp->st_vfs_file;
        dp->dl_type = type;
-        dp->dl_recall.cbr_dp = NULL;
+        dp->dl_ident = cb->cb_ident;
-        dp->dl_recall.cbr_ident = cb->cb_ident;
+        dp->dl_stateid.si_boot = get_seconds();
-        dp->dl_recall.cbr_trunc = 0;
-        dp->dl_stateid.si_boot = boot_time;
        dp->dl_stateid.si_stateownerid = current_delegid++;
        dp->dl_stateid.si_fileid = 0;
        dp->dl_stateid.si_generation = 0;
@@ -427,6 +425,11 @@ static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
 {
        int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
+        if (fchan->maxreqs < 1)
+                return nfserr_inval;
+        else if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
+                fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
        spin_lock(&nfsd_serv->sv_lock);
        if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
                np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
@@ -446,8 +449,8 @@ static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
 * fchan holds the client values on input, and the server values on output
 */
 static int init_forechannel_attrs(struct svc_rqst *rqstp,
-                                    struct nfsd4_session *session,
+                                  struct nfsd4_channel_attrs *session_fchan,
-                                    struct nfsd4_channel_attrs *fchan)
+                                  struct nfsd4_channel_attrs *fchan)
 {
        int status = 0;
        __u32   maxcount = svc_max_payload(rqstp);
@@ -457,21 +460,21 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp,
        /* Use the client's max request and max response size if possible */
        if (fchan->maxreq_sz > maxcount)
                fchan->maxreq_sz = maxcount;
-        session->se_fmaxreq_sz = fchan->maxreq_sz;
+        session_fchan->maxreq_sz = fchan->maxreq_sz;
        if (fchan->maxresp_sz > maxcount)
                fchan->maxresp_sz = maxcount;
-        session->se_fmaxresp_sz = fchan->maxresp_sz;
+        session_fchan->maxresp_sz = fchan->maxresp_sz;
        /* Set the max response cached size our default which is
         * a multiple of PAGE_SIZE and small */
-        session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
+        session_fchan->maxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
-        fchan->maxresp_cached = session->se_fmaxresp_cached;
+        fchan->maxresp_cached = session_fchan->maxresp_cached;
        /* Use the client's maxops if possible */
        if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
                fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
-        session->se_fmaxops = fchan->maxops;
+        session_fchan->maxops = fchan->maxops;
        /* try to use the client requested number of slots */
        if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
@@ -483,7 +486,7 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp,
         */
        status = set_forechannel_maxreqs(fchan);
-        session->se_fnumslots = fchan->maxreqs;
+        session_fchan->maxreqs = fchan->maxreqs;
        return status;
 }
@@ -497,12 +500,14 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
        memset(&tmp, 0, sizeof(tmp));
        /* FIXME: For now, we just accept the client back channel attributes. */
-        status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel);
+        tmp.se_bchannel = cses->back_channel;
+        status = init_forechannel_attrs(rqstp, &tmp.se_fchannel,
+                                        &cses->fore_channel);
        if (status)
                goto out;
        /* allocate struct nfsd4_session and slot table in one piece */
-        slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot);
+        slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot);
        new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
        if (!new)
                goto out;
@@ -576,7 +581,7 @@ free_session(struct kref *kref)
        int i;
        ses = container_of(kref, struct nfsd4_session, se_ref);
-        for (i = 0; i < ses->se_fnumslots; i++) {
+        for (i = 0; i < ses->se_fchannel.maxreqs; i++) {
                struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
                nfsd4_release_respages(e->ce_respages, e->ce_resused);
        }
@@ -632,16 +637,20 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 static void
 shutdown_callback_client(struct nfs4_client *clp)
 {
-        struct rpc_clnt *clnt = clp->cl_callback.cb_client;
+        struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
         if (clnt) {
                 /*
                  * Callback threads take a reference on the client, so there
                  * should be no outstanding callbacks at this point.
                  */
-                clp->cl_callback.cb_client = NULL;
+                clp->cl_cb_conn.cb_client = NULL;
                 rpc_shutdown_client(clnt);
         }
+        if (clp->cl_cb_conn.cb_cred) { /* independent of clnt: put the callback cred if one is set */
+                put_rpccred(clp->cl_cb_conn.cb_cred);
+                clp->cl_cb_conn.cb_cred = NULL; /* cleared so a repeat call does not put it again */
+        }
 }
 static inline void
@@ -714,7 +723,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
                return NULL;
        memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
        atomic_set(&clp->cl_count, 1);
-        atomic_set(&clp->cl_callback.cb_set, 0);
+        atomic_set(&clp->cl_cb_conn.cb_set, 0);
        INIT_LIST_HEAD(&clp->cl_idhash);
        INIT_LIST_HEAD(&clp->cl_strhash);
        INIT_LIST_HEAD(&clp->cl_openowners);
@@ -966,7 +975,7 @@ parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigne
 static void
 gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
 {
-        struct nfs4_callback *cb = &clp->cl_callback;
+        struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
        /* Currently, we only support tcp for the callback channel */
        if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3))
@@ -975,6 +984,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
        if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val,
                         &cb->cb_addr, &cb->cb_port)))
                goto out_err;
+        cb->cb_minorversion = 0;
        cb->cb_prog = se->se_callback_prog;
        cb->cb_ident = se->se_callback_ident;
        return;
@@ -1128,7 +1138,7 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
         * is sent (lease renewal).
         */
        if (seq && nfsd4_not_cached(resp)) {
-                seq->maxslots = resp->cstate.session->se_fnumslots;
+                seq->maxslots = resp->cstate.session->se_fchannel.maxreqs;
                return nfs_ok;
        }
@@ -1238,12 +1248,6 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
                        expire_client(conf);
                        goto out_new;
                }
-                if (ip_addr != conf->cl_addr &&
-                    !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) {
-                        /* Client collision. 18.35.4 case 3 */
-                        status = nfserr_clid_inuse;
-                        goto out;
-                }
                /*
                 * Set bit when the owner id and verifier map to an already
                 * confirmed client id (18.35.3).
@@ -1257,12 +1261,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
                copy_verf(conf, &verf);
                new = conf;
                goto out_copy;
-        } else {
+        }
-                /* 18.35.4 case 7 */
-                if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+        /* 18.35.4 case 7 */
-                        status = nfserr_noent;
+        if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
-                        goto out;
+                status = nfserr_noent;
-                }
+                goto out;
        }
        unconf  = find_unconfirmed_client_by_str(dname, strhashval, true);
@@ -1471,7 +1475,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
                goto out;
        status = nfserr_badslot;
-        if (seq->slotid >= session->se_fnumslots)
+        if (seq->slotid >= session->se_fchannel.maxreqs)
                goto out;
        slot = &session->se_slots[seq->slotid];
@@ -1686,9 +1690,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                else {
                        /* XXX: We just turn off callbacks until we can handle
                          * change request correctly. */
-                        atomic_set(&conf->cl_callback.cb_set, 0);
+                        atomic_set(&conf->cl_cb_conn.cb_set, 0);
-                        gen_confirm(conf);
-                        nfsd4_remove_clid_dir(unconf);
                        expire_client(unconf);
                        status = nfs_ok;
@@ -1882,7 +1884,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
        stp->st_stateowner = sop;
        get_nfs4_file(fp);
        stp->st_file = fp;
-        stp->st_stateid.si_boot = boot_time;
+        stp->st_stateid.si_boot = get_seconds();
        stp->st_stateid.si_stateownerid = sop->so_id;
        stp->st_stateid.si_fileid = fp->fi_id;
        stp->st_stateid.si_generation = 0;
@@ -2059,19 +2061,6 @@ nfs4_file_downgrade(struct file *filp, unsigned int share_access)
 }
 /*
- * Recall a delegation
- */
-static int
-do_recall(void *__dp)
-{
-        struct nfs4_delegation *dp = __dp;
-        dp->dl_file->fi_had_conflict = true;
-        nfsd4_cb_recall(dp);
-        return 0;
-}
-/*
 * Spawn a thread to perform a recall on the delegation represented
 * by the lease (file_lock)
 *
@@ -2082,8 +2071,7 @@ do_recall(void *__dp)
 static
 void nfsd_break_deleg_cb(struct file_lock *fl)
 {
-        struct nfs4_delegation *dp=  (struct nfs4_delegation *)fl->fl_owner;
+        struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
-        struct task_struct *t;
        dprintk("NFSD nfsd_break_deleg_cb: dp %p fl %p\n",dp,fl);
        if (!dp)
@@ -2111,16 +2099,8 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
         */
        fl->fl_break_time = 0;
-        t = kthread_run(do_recall, dp, "%s", "nfs4_cb_recall");
+        dp->dl_file->fi_had_conflict = true;
-        if (IS_ERR(t)) {
+        nfsd4_cb_recall(dp);
-                struct nfs4_client *clp = dp->dl_client;
-                printk(KERN_INFO "NFSD: Callback thread failed for "
-                        "for client (clientid %08x/%08x)\n",
-                        clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
-                put_nfs4_client(dp->dl_client);
-                nfs4_put_delegation(dp);
-        }
 }
 /*
@@ -2422,7 +2402,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 {
        struct nfs4_delegation *dp;
        struct nfs4_stateowner *sop = stp->st_stateowner;
-        struct nfs4_callback *cb = &sop->so_client->cl_callback;
+        struct nfs4_cb_conn *cb = &sop->so_client->cl_cb_conn;
        struct file_lock fl, *flp = &fl;
        int status, flag = 0;
@@ -2614,7 +2594,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        renew_client(clp);
        status = nfserr_cb_path_down;
        if (!list_empty(&clp->cl_delegations)
-                        && !atomic_read(&clp->cl_callback.cb_set))
+                        && !atomic_read(&clp->cl_cb_conn.cb_set))
                goto out;
        status = nfs_ok;
 out:
@@ -2738,12 +2718,42 @@ nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
 static int
 STALE_STATEID(stateid_t *stateid)
 {
-        if (stateid->si_boot == boot_time)
+        if (time_after((unsigned long)boot_time,
-                return 0;
+                        (unsigned long)stateid->si_boot)) {
-        dprintk("NFSD: stale stateid (%08x/%08x/%08x/%08x)!\n",
+                dprintk("NFSD: stale stateid (%08x/%08x/%08x/%08x)!\n",
-                stateid->si_boot, stateid->si_stateownerid, stateid->si_fileid,
+                        stateid->si_boot, stateid->si_stateownerid,
-                stateid->si_generation);
+                        stateid->si_fileid, stateid->si_generation);
-        return 1;
+                return 1; /* stale: si_boot is earlier than boot_time */
+        }
+        return 0; /* si_boot is not earlier than boot_time */
+}
+static int
+EXPIRED_STATEID(stateid_t *stateid)
+{
+        if (time_before((unsigned long)boot_time,
+                        ((unsigned long)stateid->si_boot)) &&
+            time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) {
+                dprintk("NFSD: expired stateid (%08x/%08x/%08x/%08x)!\n",
+                        stateid->si_boot, stateid->si_stateownerid,
+                        stateid->si_fileid, stateid->si_generation);
+                return 1; /* si_boot is after boot_time and si_boot + lease_time is already past */
+        }
+        return 0;
+}
+static __be32
+stateid_error_map(stateid_t *stateid)
+{
+        if (STALE_STATEID(stateid)) /* staleness is tested first */
+                return nfserr_stale_stateid;
+        if (EXPIRED_STATEID(stateid))
+                return nfserr_expired;
+        dprintk("NFSD: bad stateid (%08x/%08x/%08x/%08x)!\n",
+                stateid->si_boot, stateid->si_stateownerid,
+                stateid->si_fileid, stateid->si_generation);
+        return nfserr_bad_stateid; /* neither stale nor expired */
 }
 static inline int
@@ -2867,8 +2877,10 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
        status = nfserr_bad_stateid;
        if (is_delegation_stateid(stateid)) {
                dp = find_delegation_stateid(ino, stateid);
-                if (!dp)
+                if (!dp) {
+                        status = stateid_error_map(stateid);
                        goto out;
+                }
                status = check_stateid_generation(stateid, &dp->dl_stateid,
                                                  flags);
                if (status)
@@ -2881,8 +2893,10 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
                        *filpp = dp->dl_vfs_file;
        } else { /* open or lock stateid */
                stp = find_stateid(stateid, flags);
-                if (!stp)
+                if (!stp) {
+                        status = stateid_error_map(stateid);
                        goto out;
+                }
                if (nfs4_check_fh(current_fh, stp))
                        goto out;
                if (!stp->st_stateowner->so_confirmed)
@@ -2956,7 +2970,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
                 */
                sop = search_close_lru(stateid->si_stateownerid, flags);
                if (sop == NULL)
-                        return nfserr_bad_stateid;
+                        return stateid_error_map(stateid);
                *sopp = sop;
                goto check_replay;
        }
@@ -3227,8 +3241,10 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        if (!is_delegation_stateid(stateid))
                goto out;
        dp = find_delegation_stateid(inode, stateid);
-        if (!dp)
+        if (!dp) {
+                status = stateid_error_map(stateid);
                goto out;
+        }
        status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
        if (status)
                goto out;
@@ -3455,7 +3471,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
        stp->st_stateowner = sop;
        get_nfs4_file(fp);
        stp->st_file = fp;
-        stp->st_stateid.si_boot = boot_time;
+        stp->st_stateid.si_boot = get_seconds();
        stp->st_stateid.si_stateownerid = sop->so_id;
        stp->st_stateid.si_fileid = fp->fi_id;
        stp->st_stateid.si_generation = 0;
@@ -3987,6 +4003,7 @@ nfs4_state_init(void)
                INIT_LIST_HEAD(&conf_str_hashtbl[i]);
                INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
                INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
+                INIT_LIST_HEAD(&reclaim_str_hashtbl[i]);
        }
        for (i = 0; i < SESSION_HASH_SIZE; i++)
                INIT_LIST_HEAD(&sessionid_hashtbl[i]);
@@ -4009,8 +4026,6 @@ nfs4_state_init(void)
        INIT_LIST_HEAD(&close_lru);
        INIT_LIST_HEAD(&client_lru);
        INIT_LIST_HEAD(&del_recall_lru);
-        for (i = 0; i < CLIENT_HASH_SIZE; i++)
-                INIT_LIST_HEAD(&reclaim_str_hashtbl[i]);
        reclaim_str_hashtbl_size = 0;
        return 0;
 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index b73549d293be..2dcc7feaa6ff 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -83,16 +83,6 @@ check_filename(char *str, int len, __be32 err)
        return 0;
 }
-/*
- * START OF "GENERIC" DECODE ROUTINES.
- *   These may look a little ugly since they are imported from a "generic"
- * set of XDR encode/decode routines which are intended to be shared by
- * all of our NFSv4 implementations (OpenBSD, MacOS X...).
- *
- * If the pain of reading these is too great, it should be a straightforward
- * task to translate them into Linux-specific versions which are more
- * consistent with the style used in NFSv2/v3...
- */
 #define DECODE_HEAD                             \
        __be32 *p;                              \
        __be32 status
@@ -254,20 +244,8 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
        DECODE_TAIL;
 }
-static u32 nfsd_attrmask[] = {
-        NFSD_WRITEABLE_ATTRS_WORD0,
-        NFSD_WRITEABLE_ATTRS_WORD1,
-        NFSD_WRITEABLE_ATTRS_WORD2
-};
-static u32 nfsd41_ex_attrmask[] = {
-        NFSD_SUPPATTR_EXCLCREAT_WORD0,
-        NFSD_SUPPATTR_EXCLCREAT_WORD1,
-        NFSD_SUPPATTR_EXCLCREAT_WORD2
-};
 static __be32
-nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
+nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
                   struct iattr *iattr, struct nfs4_acl **acl)
 {
        int expected_len, len = 0;
@@ -280,18 +258,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
        if ((status = nfsd4_decode_bitmap(argp, bmval)))
                return status;
-        /*
-         * According to spec, unsupported attributes return ERR_ATTRNOTSUPP;
-         * read-only attributes return ERR_INVAL.
-         */
-        if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) ||
-            (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) ||
-            (bmval[2] & ~nfsd_suppattrs2(argp->minorversion)))
-                return nfserr_attrnotsupp;
-        if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
-            (bmval[2] & ~writable[2]))
-                return nfserr_inval;
        READ_BUF(4);
        READ32(expected_len);
@@ -424,8 +390,11 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
                        goto xdr_error;
                }
        }
-        BUG_ON(bmval[2]);       /* no such writeable attr supported yet */
+        if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0
-        if (len != expected_len)
+            || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1
+            || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2)
+                READ_BUF(expected_len - len);
+        else if (len != expected_len)
                goto xdr_error;
        DECODE_TAIL;
@@ -518,8 +487,8 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
        if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
                return status;
-        status = nfsd4_decode_fattr(argp, create->cr_bmval, nfsd_attrmask,
+        status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
-                                    &create->cr_iattr, &create->cr_acl);
+                                    &create->cr_acl);
        if (status)
                goto out;
@@ -682,7 +651,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
                case NFS4_CREATE_UNCHECKED:
                case NFS4_CREATE_GUARDED:
                        status = nfsd4_decode_fattr(argp, open->op_bmval,
-                                nfsd_attrmask, &open->op_iattr, &open->op_acl);
+                                &open->op_iattr, &open->op_acl);
                        if (status)
                                goto out;
                        break;
@@ -696,8 +665,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
                        READ_BUF(8);
                        COPYMEM(open->op_verf.data, 8);
                        status = nfsd4_decode_fattr(argp, open->op_bmval,
-                                nfsd41_ex_attrmask, &open->op_iattr,
+                                &open->op_iattr, &open->op_acl);
-                                &open->op_acl);
                        if (status)
                                goto out;
                        break;
@@ -893,8 +861,8 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
        status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
        if (status)
                return status;
-        return nfsd4_decode_fattr(argp, setattr->sa_bmval, nfsd_attrmask,
+        return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr,
-                                  &setattr->sa_iattr, &setattr->sa_acl);
+                                  &setattr->sa_acl);
 }
 static __be32
@@ -1328,64 +1296,64 @@ static nfsd4_dec nfsd4_dec_ops[] = {
 };
 static nfsd4_dec nfsd41_dec_ops[] = {
-        [OP_ACCESS]             (nfsd4_dec)nfsd4_decode_access,
+        [OP_ACCESS]             = (nfsd4_dec)nfsd4_decode_access,
-        [OP_CLOSE]              (nfsd4_dec)nfsd4_decode_close,
+        [OP_CLOSE]              = (nfsd4_dec)nfsd4_decode_close,
-        [OP_COMMIT]             (nfsd4_dec)nfsd4_decode_commit,
+        [OP_COMMIT]             = (nfsd4_dec)nfsd4_decode_commit,
-        [OP_CREATE]             (nfsd4_dec)nfsd4_decode_create,
+        [OP_CREATE]             = (nfsd4_dec)nfsd4_decode_create,
-        [OP_DELEGPURGE]         (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_DELEGPURGE]         = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_DELEGRETURN]        (nfsd4_dec)nfsd4_decode_delegreturn,
+        [OP_DELEGRETURN]        = (nfsd4_dec)nfsd4_decode_delegreturn,
-        [OP_GETATTR]            (nfsd4_dec)nfsd4_decode_getattr,
+        [OP_GETATTR]            = (nfsd4_dec)nfsd4_decode_getattr,
-        [OP_GETFH]              (nfsd4_dec)nfsd4_decode_noop,
+        [OP_GETFH]              = (nfsd4_dec)nfsd4_decode_noop,
-        [OP_LINK]               (nfsd4_dec)nfsd4_decode_link,
+        [OP_LINK]               = (nfsd4_dec)nfsd4_decode_link,
-        [OP_LOCK]               (nfsd4_dec)nfsd4_decode_lock,
+        [OP_LOCK]               = (nfsd4_dec)nfsd4_decode_lock,
-        [OP_LOCKT]              (nfsd4_dec)nfsd4_decode_lockt,
+        [OP_LOCKT]              = (nfsd4_dec)nfsd4_decode_lockt,
-        [OP_LOCKU]              (nfsd4_dec)nfsd4_decode_locku,
+        [OP_LOCKU]              = (nfsd4_dec)nfsd4_decode_locku,
-        [OP_LOOKUP]             (nfsd4_dec)nfsd4_decode_lookup,
+        [OP_LOOKUP]             = (nfsd4_dec)nfsd4_decode_lookup,
-        [OP_LOOKUPP]            (nfsd4_dec)nfsd4_decode_noop,
+        [OP_LOOKUPP]            = (nfsd4_dec)nfsd4_decode_noop,
-        [OP_NVERIFY]            (nfsd4_dec)nfsd4_decode_verify,
+        [OP_NVERIFY]            = (nfsd4_dec)nfsd4_decode_verify,
-        [OP_OPEN]               (nfsd4_dec)nfsd4_decode_open,
+        [OP_OPEN]               = (nfsd4_dec)nfsd4_decode_open,
-        [OP_OPENATTR]           (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_OPENATTR]           = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_OPEN_CONFIRM]       (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_OPEN_CONFIRM]       = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_OPEN_DOWNGRADE]     (nfsd4_dec)nfsd4_decode_open_downgrade,
+        [OP_OPEN_DOWNGRADE]     = (nfsd4_dec)nfsd4_decode_open_downgrade,
-        [OP_PUTFH]              (nfsd4_dec)nfsd4_decode_putfh,
+        [OP_PUTFH]              = (nfsd4_dec)nfsd4_decode_putfh,
-        [OP_PUTPUBFH]           (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_PUTPUBFH]           = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_PUTROOTFH]          (nfsd4_dec)nfsd4_decode_noop,
+        [OP_PUTROOTFH]          = (nfsd4_dec)nfsd4_decode_noop,
-        [OP_READ]               (nfsd4_dec)nfsd4_decode_read,
+        [OP_READ]               = (nfsd4_dec)nfsd4_decode_read,
-        [OP_READDIR]            (nfsd4_dec)nfsd4_decode_readdir,
+        [OP_READDIR]            = (nfsd4_dec)nfsd4_decode_readdir,
-        [OP_READLINK]           (nfsd4_dec)nfsd4_decode_noop,
+        [OP_READLINK]           = (nfsd4_dec)nfsd4_decode_noop,
-        [OP_REMOVE]             (nfsd4_dec)nfsd4_decode_remove,
+        [OP_REMOVE]             = (nfsd4_dec)nfsd4_decode_remove,
-        [OP_RENAME]             (nfsd4_dec)nfsd4_decode_rename,
+        [OP_RENAME]             = (nfsd4_dec)nfsd4_decode_rename,
-        [OP_RENEW]              (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_RENEW]              = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_RESTOREFH]          (nfsd4_dec)nfsd4_decode_noop,
+        [OP_RESTOREFH]          = (nfsd4_dec)nfsd4_decode_noop,
-        [OP_SAVEFH]             (nfsd4_dec)nfsd4_decode_noop,
+        [OP_SAVEFH]             = (nfsd4_dec)nfsd4_decode_noop,
-        [OP_SECINFO]            (nfsd4_dec)nfsd4_decode_secinfo,
+        [OP_SECINFO]            = (nfsd4_dec)nfsd4_decode_secinfo,
-        [OP_SETATTR]            (nfsd4_dec)nfsd4_decode_setattr,
+        [OP_SETATTR]            = (nfsd4_dec)nfsd4_decode_setattr,
-        [OP_SETCLIENTID]        (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_SETCLIENTID]        = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_SETCLIENTID_CONFIRM](nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_SETCLIENTID_CONFIRM]= (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_VERIFY]             (nfsd4_dec)nfsd4_decode_verify,
+        [OP_VERIFY]             = (nfsd4_dec)nfsd4_decode_verify,
-        [OP_WRITE]              (nfsd4_dec)nfsd4_decode_write,
+        [OP_WRITE]              = (nfsd4_dec)nfsd4_decode_write,
-        [OP_RELEASE_LOCKOWNER]  (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_RELEASE_LOCKOWNER]  = (nfsd4_dec)nfsd4_decode_notsupp,
        /* new operations for NFSv4.1 */
-        [OP_BACKCHANNEL_CTL]    (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_BACKCHANNEL_CTL]    = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_BIND_CONN_TO_SESSION](nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_EXCHANGE_ID]        (nfsd4_dec)nfsd4_decode_exchange_id,
+        [OP_EXCHANGE_ID]        = (nfsd4_dec)nfsd4_decode_exchange_id,
-        [OP_CREATE_SESSION]     (nfsd4_dec)nfsd4_decode_create_session,
+        [OP_CREATE_SESSION]     = (nfsd4_dec)nfsd4_decode_create_session,
-        [OP_DESTROY_SESSION]    (nfsd4_dec)nfsd4_decode_destroy_session,
+        [OP_DESTROY_SESSION]    = (nfsd4_dec)nfsd4_decode_destroy_session,
-        [OP_FREE_STATEID]       (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_FREE_STATEID]       = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_GET_DIR_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_GETDEVICEINFO]      (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_GETDEVICEINFO]      = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_GETDEVICELIST]      (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_GETDEVICELIST]      = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_LAYOUTCOMMIT]       (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_LAYOUTCOMMIT]       = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_LAYOUTGET]          (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_LAYOUTGET]          = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_LAYOUTRETURN]       (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_LAYOUTRETURN]       = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_SECINFO_NO_NAME]    (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_SECINFO_NO_NAME]    = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_SEQUENCE]           (nfsd4_dec)nfsd4_decode_sequence,
+        [OP_SEQUENCE]           = (nfsd4_dec)nfsd4_decode_sequence,
-        [OP_SET_SSV]            (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_SET_SSV]            = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_TEST_STATEID]       (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_TEST_STATEID]       = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_WANT_DELEGATION]    (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_WANT_DELEGATION]    = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_DESTROY_CLIENTID]   (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_DESTROY_CLIENTID]   = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_RECLAIM_COMPLETE]   (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_RECLAIM_COMPLETE]   = (nfsd4_dec)nfsd4_decode_notsupp,
 };
 struct nfsd4_minorversion_ops {
@@ -1489,21 +1457,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
        DECODE_TAIL;
 }
-/*
- * END OF "GENERIC" DECODE ROUTINES.
- */
-/*
- * START OF "GENERIC" ENCODE ROUTINES.
- *   These may look a little ugly since they are imported from a "generic"
- * set of XDR encode/decode routines which are intended to be shared by
- * all of our NFSv4 implementations (OpenBSD, MacOS X...).
- *
- * If the pain of reading these is too great, it should be a straightforward
- * task to translate them into Linux-specific versions which are more
- * consistent with the style used in NFSv2/v3...
- */
-#define ENCODE_HEAD              __be32 *p
 #define WRITE32(n)               *p++ = htonl(n)
 #define WRITE64(n)               do {                           \
@@ -1515,13 +1468,41 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
        memcpy(p, ptr, nbytes);                                 \
        p += XDR_QUADLEN(nbytes);                               \
 }} while (0)
-#define WRITECINFO(c)           do {                            \
-        *p++ = htonl(c.atomic);                                 \
+static void write32(__be32 **p, u32 n)
-        *p++ = htonl(c.before_ctime_sec);                               \
+{
-        *p++ = htonl(c.before_ctime_nsec);                              \
+        *(*p)++ = htonl(n);     /* network byte order, same as WRITE32() */
-        *p++ = htonl(c.after_ctime_sec);                                \
+}
-        *p++ = htonl(c.after_ctime_nsec);                               \
-} while (0)
+static void write64(__be32 **p, u64 n)
+{
+        write32(p, (u32)(n >> 32));     /* upper 32 bits are emitted first */
+        write32(p, (u32)n);             /* followed by the lower 32 bits */
+}
+static void write_change(__be32 **p, struct kstat *stat, struct inode *inode)
+{
+        if (IS_I_VERSION(inode)) {      /* emit i_version as one 64-bit value */
+                write64(p, inode->i_version);
+        } else {                        /* else ctime: seconds, then nanoseconds */
+                write32(p, stat->ctime.tv_sec);
+                write32(p, stat->ctime.tv_nsec);
+        }
+}
+static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
+{
+        write32(p, c->atomic);          /* first word: the atomic flag */
+        if (c->change_supported) {      /* before/after as 64-bit change values */
+                write64(p, c->before_change);
+                write64(p, c->after_change);
+        } else {                        /* before/after as ctime sec + nsec pairs */
+                write32(p, c->before_ctime_sec);
+                write32(p, c->before_ctime_nsec);
+                write32(p, c->after_ctime_sec);
+                write32(p, c->after_ctime_nsec);
+        }                               /* either branch adds four 32-bit words */
+}
 #define RESERVE_SPACE(nbytes)   do {                            \
        p = resp->p;                                            \
@@ -1874,16 +1855,9 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
                        WRITE32(NFS4_FH_PERSISTENT|NFS4_FH_VOL_RENAME);
        }
        if (bmval0 & FATTR4_WORD0_CHANGE) {
-                /*
-                 * Note: This _must_ be consistent with the scheme for writing
-                 * change_info, so any changes made here must be reflected there
-                 * as well.  (See xdr4.h:set_change_info() and the WRITECINFO()
-                 * macro above.)
-                 */
                if ((buflen -= 8) < 0)
                        goto out_resource;
-                WRITE32(stat.ctime.tv_sec);
+                write_change(&p, &stat, dentry->d_inode);
-                WRITE32(stat.ctime.tv_nsec);
        }
        if (bmval0 & FATTR4_WORD0_SIZE) {
                if ((buflen -= 8) < 0)
@@ -2348,7 +2322,7 @@ fail:
 static void
 nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid)
 {
-        ENCODE_HEAD;
+        __be32 *p;
        RESERVE_SPACE(sizeof(stateid_t));
        WRITE32(sid->si_generation);
@@ -2359,7 +2333,7 @@ nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid)
 static __be32
 nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
 {
-        ENCODE_HEAD;
+        __be32 *p;
        if (!nfserr) {
                RESERVE_SPACE(8);
@@ -2386,7 +2360,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
 static __be32
 nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
 {
-        ENCODE_HEAD;
+        __be32 *p;
        if (!nfserr) {
                RESERVE_SPACE(8);
@@ -2399,11 +2373,11 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 static __be32
 nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
 {
-        ENCODE_HEAD;
+        __be32 *p;
        if (!nfserr) {
                RESERVE_SPACE(32);
-                WRITECINFO(create->cr_cinfo);
+                write_cinfo(&p, &create->cr_cinfo);
                WRITE32(2);
                WRITE32(create->cr_bmval[0]);
                WRITE32(create->cr_bmval[1]);
@@ -2435,7 +2409,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
 {
        struct svc_fh *fhp = *fhpp;
        unsigned int len;
-        ENCODE_HEAD;
+        __be32 *p;
        if (!nfserr) {
                len = fhp->fh_handle.fh_size;
@@ -2454,7 +2428,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
 static void
 nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld)
 {
-        ENCODE_HEAD;
+        __be32 *p;
        RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? ld->ld_sop->so_owner.len : 0));
        WRITE64(ld->ld_start);
@@ -2510,11 +2484,11 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
 static __be32
 nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
 {
-        ENCODE_HEAD;
+        __be32 *p;
        if (!nfserr) {
                RESERVE_SPACE(20);
-                WRITECINFO(link->li_cinfo);
+                write_cinfo(&p, &link->li_cinfo);
                ADJUST_ARGS();
        }
        return nfserr;
@@ -2524,7 +2498,7 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
 static __be32
 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
 {
-        ENCODE_HEAD;
+        __be32 *p;
        ENCODE_SEQID_OP_HEAD;
        if (nfserr)
@@ -2532,7 +2506,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
        nfsd4_encode_stateid(resp, &open->op_stateid);
        RESERVE_SPACE(40);
-        WRITECINFO(open->op_cinfo);
+        write_cinfo(&p, &open->op_cinfo);
        WRITE32(open->op_rflags);
        WRITE32(2);
        WRITE32(open->op_bmval[0]);
@@ -2619,7 +2593,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
        int v, pn;
        unsigned long maxcount; 
        long len;
-        ENCODE_HEAD;
+        __be32 *p;
        if (nfserr)
                return nfserr;
@@ -2681,7 +2655,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
 {
        int maxcount;
        char *page;
-        ENCODE_HEAD;
+        __be32 *p;
        if (nfserr)
                return nfserr;
@@ -2730,7 +2704,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
        int maxcount;
        loff_t offset;
        __be32 *page, *savep, *tailbase;
-        ENCODE_HEAD;
+        __be32 *p;
        if (nfserr)
                return nfserr;
@@ -2806,11 +2780,11 @@ err_no_verf:
 static __be32
 nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
 {
-        ENCODE_HEAD;
+        __be32 *p;
        if (!nfserr) {
                RESERVE_SPACE(20);
-                WRITECINFO(remove->rm_cinfo);
+                write_cinfo(&p, &remove->rm_cinfo);
                ADJUST_ARGS();
        }
        return nfserr;
@@ -2819,12 +2793,12 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 static __be32
 nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
 {
-        ENCODE_HEAD;
+        __be32 *p;
        if (!nfserr) {
                RESERVE_SPACE(40);
-                WRITECINFO(rename->rn_sinfo);
+                write_cinfo(&p, &rename->rn_sinfo);
-                WRITECINFO(rename->rn_tinfo);
+                write_cinfo(&p, &rename->rn_tinfo);
                ADJUST_ARGS();
        }
        return nfserr;
@@ -2839,7 +2813,7 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
        u32 nflavs;
        struct exp_flavor_info *flavs;
        struct exp_flavor_info def_flavs[2];
-        ENCODE_HEAD;
+        __be32 *p;
        if (nfserr)
                goto out;
@@ -2904,7 +2878,7 @@ out:
 static __be32
 nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
 {
-        ENCODE_HEAD;
+        __be32 *p;
        RESERVE_SPACE(12);
        if (nfserr) {
@@ -2924,7 +2898,7 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
 static __be32
 nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
 {
-        ENCODE_HEAD;
+        __be32 *p;
        if (!nfserr) {
                RESERVE_SPACE(8 + sizeof(nfs4_verifier));
@@ -2944,7 +2918,7 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n
 static __be32
 nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
 {
-        ENCODE_HEAD;
+        __be32 *p;
        if (!nfserr) {
                RESERVE_SPACE(16);
@@ -2960,7 +2934,7 @@ static __be32
 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
                         struct nfsd4_exchange_id *exid)
 {
-        ENCODE_HEAD;
+        __be32 *p;
        char *major_id;
        char *server_scope;
        int major_id_sz;
@@ -3015,7 +2989,7 @@ static __be32
 nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
                            struct nfsd4_create_session *sess)
 {
-        ENCODE_HEAD;
+        __be32 *p;
        if (nfserr)
                return nfserr;
@@ -3071,7 +3045,7 @@ __be32
 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
                      struct nfsd4_sequence *seq)
 {
-        ENCODE_HEAD;
+        __be32 *p;
        if (nfserr)
                return nfserr;
@@ -3209,7 +3183,7 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
        dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
                length, xb->page_len, tlen, pad);
-        if (length <= session->se_fmaxresp_cached)
+        if (length <= session->se_fchannel.maxresp_cached)
                return status;
        else
                return nfserr_rep_too_big_to_cache;
@@ -3219,7 +3193,7 @@ void
 nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 {
        __be32 *statp;
-        ENCODE_HEAD;
+        __be32 *p;
        RESERVE_SPACE(8);
        WRITE32(op->opnum);
@@ -3253,7 +3227,7 @@ status:
 void
 nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 {
-        ENCODE_HEAD;
+        __be32 *p;
        struct nfs4_replay *rp = op->replay;
        BUG_ON(!rp);
@@ -3268,10 +3242,6 @@ nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
        ADJUST_ARGS();
 }
-/*
- * END OF "GENERIC" ENCODE ROUTINES.
- */
 int
 nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 5bfc2ac60d54..4638635c5d87 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -29,15 +29,24 @@
 */
 #define CACHESIZE               1024
 #define HASHSIZE                64
-#define REQHASH(xid)            (((((__force __u32)xid) >> 24) ^ ((__force __u32)xid)) & (HASHSIZE-1))
-static struct hlist_head *      hash_list;
+static struct hlist_head *      cache_hash;
 static struct list_head         lru_head;
 static int                      cache_disabled = 1;
+/*
+ * Calculate the hash index from an XID.
+ */
+static inline u32 request_hash(u32 xid)
+{
+        u32 h = xid;
+        h ^= (xid >> 24);               /* fold the top byte into the low bits */
+        return h & (HASHSIZE-1);        /* mask down to a bucket index */
+}
 static int      nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
-/* 
+/*
 * locking for the reply cache:
 * A cache entry is "single use" if c_state == RC_INPROG
 * Otherwise, it when accessing _prev or _next, the lock must be held.
@@ -62,8 +71,8 @@ int nfsd_reply_cache_init(void)
                i--;
        }
-        hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
+        cache_hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
-        if (!hash_list)
+        if (!cache_hash)
                goto out_nomem;
        cache_disabled = 0;
@@ -88,8 +97,8 @@ void nfsd_reply_cache_shutdown(void)
        cache_disabled = 1;
-        kfree (hash_list);
+        kfree (cache_hash);
-        hash_list = NULL;
+        cache_hash = NULL;
 }
 /*
@@ -108,7 +117,7 @@ static void
 hash_refile(struct svc_cacherep *rp)
 {
        hlist_del_init(&rp->c_hash);
-        hlist_add_head(&rp->c_hash, hash_list + REQHASH(rp->c_xid));
+        hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid));
 }
 /*
@@ -138,7 +147,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type)
        spin_lock(&cache_lock);
        rtn = RC_DOIT;
-        rh = &hash_list[REQHASH(xid)];
+        rh = &cache_hash[request_hash(xid)];
        hlist_for_each_entry(rp, hn, rh, c_hash) {
                if (rp->c_state != RC_UNUSED &&
                    xid == rp->c_xid && proc == rp->c_proc &&
@@ -165,8 +174,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type)
        }
        }
-        /* This should not happen */
+        /* All entries on the LRU are in-progress. This should not happen */
-        if (rp == NULL) {
+        if (&rp->c_lru == &lru_head) {
                static int      complaints;
                printk(KERN_WARNING "nfsd: all repcache entries locked!\n");
@@ -264,7 +273,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
        len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
        len >>= 2;
-        
        /* Don't cache excessive amounts of data and XDR failures */
        if (!statp || len > (256 >> 2)) {
                rp->c_state = RC_UNUSED;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index af16849d243a..6d0847562d87 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -25,7 +25,6 @@
 #include <linux/init.h>
 #include <linux/inet.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/ctype.h>
 #include <linux/nfs.h>
@@ -207,10 +206,14 @@ static struct file_operations pool_stats_operations = {
 static ssize_t write_svc(struct file *file, char *buf, size_t size)
 {
        struct nfsctl_svc *data;
+        int err;
        if (size < sizeof(*data))
                return -EINVAL;
        data = (struct nfsctl_svc*) buf;
-        return nfsd_svc(data->svc_port, data->svc_nthreads);
+        err = nfsd_svc(data->svc_port, data->svc_nthreads);
+        if (err < 0)
+                return err;
+        return 0;
 }
 /**
@@ -692,11 +695,12 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
                if (newthreads < 0)
                        return -EINVAL;
                rv = nfsd_svc(NFS_PORT, newthreads);
-                if (rv)
+                if (rv < 0)
                        return rv;
-        }
+        } else
-        sprintf(buf, "%d\n", nfsd_nrthreads());
+                rv = nfsd_nrthreads();
-        return strlen(buf);
+        return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv);
 }
 /**
@@ -793,7 +797,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 {
        char *mesg = buf;
        char *vers, *minorp, sign;
-        int len, num;
+        int len, num, remaining;
        unsigned minor;
        ssize_t tlen = 0;
        char *sep;
@@ -840,32 +844,50 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
                        }
                next:
                        vers += len + 1;
-                        tlen += len;
                } while ((len = qword_get(&mesg, vers, size)) > 0);
                /* If all get turned off, turn them back on, as
                 * having no versions is BAD
                 */
                nfsd_reset_versions();
        }
        /* Now write current state into reply buffer */
        len = 0;
        sep = "";
+        remaining = SIMPLE_TRANSACTION_LIMIT;
        for (num=2 ; num <= 4 ; num++)
                if (nfsd_vers(num, NFSD_AVAIL)) {
-                        len += sprintf(buf+len, "%s%c%d", sep,
+                        len = snprintf(buf, remaining, "%s%c%d", sep,
                                       nfsd_vers(num, NFSD_TEST)?'+':'-',
                                       num);
                        sep = " ";
+                        if (len > remaining)
+                                break;
+                        remaining -= len;
+                        buf += len;
+                        tlen += len;
                }
        if (nfsd_vers(4, NFSD_AVAIL))
-                for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++)
+                for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION;
-                        len += sprintf(buf+len, " %c4.%u",
+                     minor++) {
+                        len = snprintf(buf, remaining, " %c4.%u",
                                        (nfsd_vers(4, NFSD_TEST) &&
                                         nfsd_minorversion(minor, NFSD_TEST)) ?
                                                '+' : '-',
                                        minor);
-        len += sprintf(buf+len, "\n");
-        return len;
+                        if (len > remaining)
+                                break;
+                        remaining -= len;
+                        buf += len;
+                        tlen += len;
+                }
+        len = snprintf(buf, remaining, "\n");
+        if (len > remaining)
+                return -EINVAL;
+        return tlen + len;
 }
 /**
@@ -910,104 +932,143 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size)
        return rv;
 }
-static ssize_t __write_ports(struct file *file, char *buf, size_t size)
+/*
+ * Zero-length write.  Return a list of NFSD's current listener
+ * transports.
+ */
+static ssize_t __write_ports_names(char *buf)
 {
-        if (size == 0) {
+        if (nfsd_serv == NULL)
-                int len = 0;
+                return 0;
+        return svc_xprt_names(nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT);
+}
-                if (nfsd_serv)
+/*
-                        len = svc_xprt_names(nfsd_serv, buf, 0);
+ * A single 'fd' number was written, in which case it must be for
-                return len;
+ * a socket of a supported family/protocol, and we use it as an
-        }
+ * nfsd listener.
-        /* Either a single 'fd' number is written, in which
+ */
-         * case it must be for a socket of a supported family/protocol,
+static ssize_t __write_ports_addfd(char *buf)
-         * and we use it as an nfsd socket, or
+{
-         * A '-' followed by the 'name' of a socket in which case
+        char *mesg = buf;
-         * we close the socket.
+        int fd, err;
-         */
-        if (isdigit(buf[0])) {
+        err = get_int(&mesg, &fd);
-                char *mesg = buf;
+        if (err != 0 || fd < 0)
-                int fd;
+                return -EINVAL;
-                int err;
-                err = get_int(&mesg, &fd);
+        err = nfsd_create_serv();
-                if (err)
+        if (err != 0)
-                        return -EINVAL;
+                return err;
-                if (fd < 0)
-                        return -EINVAL;
+        err = lockd_up();
-                err = nfsd_create_serv();
+        if (err != 0)
-                if (!err) {
+                goto out;
-                        err = svc_addsock(nfsd_serv, fd, buf);
-                        if (err >= 0) {
+        err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
-                                err = lockd_up();
+        if (err < 0)
-                                if (err < 0)
+                lockd_down();
-                                        svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf);
-                        }
+out:
-                        /* Decrease the count, but don't shutdown the
+        /* Decrease the count, but don't shut down the service */
-                         * the service
+        nfsd_serv->sv_nrthreads--;
-                         */
+        return err;
-                        nfsd_serv->sv_nrthreads--;
+}
-                }
-                return err < 0 ? err : 0;
+/*
-        }
+ * A '-' followed by the 'name' of a socket means we close the socket.
-        if (buf[0] == '-' && isdigit(buf[1])) {
+ */
-                char *toclose = kstrdup(buf+1, GFP_KERNEL);
+static ssize_t __write_ports_delfd(char *buf)
-                int len = 0;
+{
-                if (!toclose)
+        char *toclose;
-                        return -ENOMEM;
+        int len = 0;
-                if (nfsd_serv)
-                        len = svc_sock_names(buf, nfsd_serv, toclose);
+        toclose = kstrdup(buf + 1, GFP_KERNEL);
-                if (len >= 0)
+        if (toclose == NULL)
-                        lockd_down();
+                return -ENOMEM;
-                kfree(toclose);
-                return len;
+        if (nfsd_serv != NULL)
-        }
+                len = svc_sock_names(nfsd_serv, buf,
-        /*
+                                        SIMPLE_TRANSACTION_LIMIT, toclose);
-         * Add a transport listener by writing it's transport name
+        if (len >= 0)
-         */
+                lockd_down();
-        if (isalpha(buf[0])) {
-                int err;
+        kfree(toclose);
-                char transport[16];
+        return len;
-                int port;
+}
-                if (sscanf(buf, "%15s %4d", transport, &port) == 2) {
-                        if (port < 1 || port > 65535)
+/*
-                                return -EINVAL;
+ * A transport listener is added by writing its transport name and
-                        err = nfsd_create_serv();
+ * a port number.
-                        if (!err) {
+ */
-                                err = svc_create_xprt(nfsd_serv,
+static ssize_t __write_ports_addxprt(char *buf)
-                                                      transport, PF_INET, port,
+{
-                                                      SVC_SOCK_ANONYMOUS);
+        char transport[16];
-                                if (err == -ENOENT)
+        int port, err;
-                                        /* Give a reasonable perror msg for
-                                         * bad transport string */
+        if (sscanf(buf, "%15s %5u", transport, &port) != 2)
-                                        err = -EPROTONOSUPPORT;
+                return -EINVAL;
-                        }
-                        return err < 0 ? err : 0;
+        if (port < 1 || port > USHORT_MAX)      /* up to five digits parsed */
-                }
+                return -EINVAL;
-        }
-        /*
+        err = nfsd_create_serv();
-         * Remove a transport by writing it's transport name and port number
+        if (err != 0)
-         */
+                return err;
-        if (buf[0] == '-' && isalpha(buf[1])) {
-                struct svc_xprt *xprt;
+        err = svc_create_xprt(nfsd_serv, transport,
-                int err = -EINVAL;
+                                PF_INET, port, SVC_SOCK_ANONYMOUS);
-                char transport[16];
+        if (err < 0) {
-                int port;
+                /* Give a reasonable perror msg for bad transport string */
-                if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
+                if (err == -ENOENT)
-                        if (port < 1 || port > 65535)
+                        err = -EPROTONOSUPPORT;
-                                return -EINVAL;
+                return err;
-                        if (nfsd_serv) {
-                                xprt = svc_find_xprt(nfsd_serv, transport,
-                                                     AF_UNSPEC, port);
-                                if (xprt) {
-                                        svc_close_xprt(xprt);
-                                        svc_xprt_put(xprt);
-                                        err = 0;
-                                } else
-                                        err = -ENOTCONN;
-                        }
-                        return err < 0 ? err : 0;
-                }
        }
+        return 0;
+}
+/*
+ * A transport listener is removed by writing a "-", its transport
+ * name, and its port number.
+ */
+static ssize_t __write_ports_delxprt(char *buf)
+{
+        struct svc_xprt *xprt;
+        char transport[16];
+        int port;
+        if (sscanf(&buf[1], "%15s %5u", transport, &port) != 2) /* skip '-' */
+                return -EINVAL;
+        if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL)
+                return -EINVAL;
+        xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port);
+        if (xprt == NULL)               /* no listener with that name and port */
+                return -ENOTCONN;
+        svc_close_xprt(xprt);
+        svc_xprt_put(xprt);
+        return 0;
+}
+static ssize_t __write_ports(struct file *file, char *buf, size_t size)
+{
+        if (size == 0)                  /* zero-length write: list listeners */
+                return __write_ports_names(buf);
+        if (isdigit(buf[0]))            /* leading digit: add a socket by fd */
+                return __write_ports_addfd(buf);
+        if (buf[0] == '-' && isdigit(buf[1]))   /* "-" + digit: close a socket */
+                return __write_ports_delfd(buf);
+        if (isalpha(buf[0]))            /* leading letter: add a transport */
+                return __write_ports_addxprt(buf);
+        if (buf[0] == '-' && isalpha(buf[1]))   /* "-" + letter: remove one */
+                return __write_ports_delxprt(buf);
        return -EINVAL;
 }
@@ -1030,7 +1091,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
 *                      buf:            C string containing an unsigned
 *                                      integer value representing a bound
 *                                      but unconnected socket that is to be
- *                                      used as an NFSD listener
+ *                                      used as an NFSD listener; listen(3)
+ *                                      must be called for a SOCK_STREAM
+ *                                      socket, otherwise it is ignored
 *                      size:           non-zero length of C string in @buf
 * Output:
 *      On success:     NFS service is started;
@@ -1138,7 +1201,9 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
                nfsd_max_blksize = bsize;
                mutex_unlock(&nfsd_mutex);
        }
-        return sprintf(buf, "%d\n", nfsd_max_blksize);
+        return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n",
+                                                        nfsd_max_blksize);
 }
 #ifdef CONFIG_NFSD_V4
@@ -1162,8 +1227,9 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
                        return -EINVAL;
                nfs4_reset_lease(lease);
        }
-        sprintf(buf, "%ld\n", nfs4_lease_time());
-        return strlen(buf);
+        return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n",
+                                                        nfs4_lease_time());
 }
 /**
@@ -1219,8 +1285,9 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
                status = nfs4_reset_recoverydir(recdir);
        }
-        sprintf(buf, "%s\n", nfs4_recoverydir());
-        return strlen(buf);
+        return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n",
+                                                        nfs4_recoverydir());
 }
 /**
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 9f1ca17293d3..8847f3fbfc1e 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -27,9 +27,6 @@
 #define NFSDDBG_FACILITY                NFSDDBG_FH
-static int nfsd_nr_verified;
-static int nfsd_nr_put;
 /*
 * our acceptability function.
 * if NOSUBTREECHECK, accept anything
@@ -251,7 +248,6 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
        fhp->fh_dentry = dentry;
        fhp->fh_export = exp;
-        nfsd_nr_verified++;
        return 0;
 out:
        exp_put(exp);
@@ -552,7 +548,6 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
                        return nfserr_opnotsupp;
        }
-        nfsd_nr_verified++;
        return 0;
 }
@@ -609,7 +604,6 @@ fh_put(struct svc_fh *fhp)
                fhp->fh_pre_saved = 0;
                fhp->fh_post_saved = 0;
 #endif
-                nfsd_nr_put++;
        }
        if (exp) {
                cache_put(&exp->h, &svc_export_cache);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index e298e260b5f1..0eb9c820b7a6 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -533,45 +533,179 @@ nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle   *argp,
 * NFSv2 Server procedures.
 * Only the results of non-idempotent operations are cached.
 */
-#define nfsd_proc_none          NULL
-#define nfssvc_release_none     NULL
 struct nfsd_void { int dummy; };
-#define PROC(name, argt, rest, relt, cache, respsize)   \
- { (svc_procfunc) nfsd_proc_##name,             \
-   (kxdrproc_t) nfssvc_decode_##argt,           \
-   (kxdrproc_t) nfssvc_encode_##rest,           \
-   (kxdrproc_t) nfssvc_release_##relt,          \
-   sizeof(struct nfsd_##argt),                  \
-   sizeof(struct nfsd_##rest),                  \
-   0,                                           \
-   cache,                                       \
-   respsize,                                    \
- }
 #define ST 1            /* status */
 #define FH 8            /* filehandle */
 #define AT 18           /* attributes */
 static struct svc_procedure             nfsd_procedures2[18] = {
-  PROC(null,     void,          void,           none,           RC_NOCACHE, ST),
+        [NFSPROC_NULL] = {
-  PROC(getattr,  fhandle,       attrstat,       fhandle,        RC_NOCACHE, ST+AT),
+                .pc_func = (svc_procfunc) nfsd_proc_null,
-  PROC(setattr,  sattrargs,     attrstat,       fhandle,        RC_REPLBUFF, ST+AT),
+                .pc_decode = (kxdrproc_t) nfssvc_decode_void,
-  PROC(none,     void,          void,           none,           RC_NOCACHE, ST),
+                .pc_encode = (kxdrproc_t) nfssvc_encode_void,
-  PROC(lookup,   diropargs,     diropres,       fhandle,        RC_NOCACHE, ST+FH+AT),
+                .pc_argsize = sizeof(struct nfsd_void),
-  PROC(readlink, readlinkargs,  readlinkres,    none,           RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4),
+                .pc_ressize = sizeof(struct nfsd_void),
-  PROC(read,     readargs,      readres,        fhandle,        RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4),
+                .pc_cachetype = RC_NOCACHE,
-  PROC(none,     void,          void,           none,           RC_NOCACHE, ST),
+                .pc_xdrressize = ST,
-  PROC(write,    writeargs,     attrstat,       fhandle,        RC_REPLBUFF, ST+AT),
+        },
-  PROC(create,   createargs,    diropres,       fhandle,        RC_REPLBUFF, ST+FH+AT),
+        [NFSPROC_GETATTR] = {
-  PROC(remove,   diropargs,     void,           none,           RC_REPLSTAT, ST),
+                .pc_func = (svc_procfunc) nfsd_proc_getattr,
-  PROC(rename,   renameargs,    void,           none,           RC_REPLSTAT, ST),
+                .pc_decode = (kxdrproc_t) nfssvc_decode_fhandle,
-  PROC(link,     linkargs,      void,           none,           RC_REPLSTAT, ST),
+                .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat,
-  PROC(symlink,  symlinkargs,   void,           none,           RC_REPLSTAT, ST),
+                .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
-  PROC(mkdir,    createargs,    diropres,       fhandle,        RC_REPLBUFF, ST+FH+AT),
+                .pc_argsize = sizeof(struct nfsd_fhandle),
-  PROC(rmdir,    diropargs,     void,           none,           RC_REPLSTAT, ST),
+                .pc_ressize = sizeof(struct nfsd_attrstat),
-  PROC(readdir,  readdirargs,   readdirres,     none,           RC_NOCACHE, 0),
+                .pc_cachetype = RC_NOCACHE,
-  PROC(statfs,   fhandle,       statfsres,      none,           RC_NOCACHE, ST+5),
+                .pc_xdrressize = ST+AT,
+        },
+        [NFSPROC_SETATTR] = {
+                .pc_func = (svc_procfunc) nfsd_proc_setattr,
+                .pc_decode = (kxdrproc_t) nfssvc_decode_sattrargs,
+                .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat,
+                .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
+                .pc_argsize = sizeof(struct nfsd_sattrargs),
+                .pc_ressize = sizeof(struct nfsd_attrstat),
+                .pc_cachetype = RC_REPLBUFF,
+                .pc_xdrressize = ST+AT,
+        },
+        [NFSPROC_ROOT] = {
+                .pc_decode = (kxdrproc_t) nfssvc_decode_void,
+                .pc_encode = (kxdrproc_t) nfssvc_encode_void,
+                .pc_argsize = sizeof(struct nfsd_void),
+                .pc_ressize = sizeof(struct nfsd_void),
+                .pc_cachetype = RC_NOCACHE,
+                .pc_xdrressize = ST,
+        },
+        [NFSPROC_LOOKUP] = {
+                .pc_func = (svc_procfunc) nfsd_proc_lookup,
+                .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs,
+                .pc_encode = (kxdrproc_t) nfssvc_encode_diropres,
+                .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
+                .pc_argsize = sizeof(struct nfsd_diropargs),
+                .pc_ressize = sizeof(struct nfsd_diropres),
+                .pc_cachetype = RC_NOCACHE,
+                .pc_xdrressize = ST+FH+AT,
+        },
+        [NFSPROC_READLINK] = {
+                .pc_func = (svc_procfunc) nfsd_proc_readlink,
+                .pc_decode = (kxdrproc_t) nfssvc_decode_readlinkargs,
+                .pc_encode = (kxdrproc_t) nfssvc_encode_readlinkres,
+                .pc_argsize = sizeof(struct nfsd_readlinkargs),
+                .pc_ressize = sizeof(struct nfsd_readlinkres),
+                .pc_cachetype = RC_NOCACHE,
+                .pc_xdrressize = ST+1+NFS_MAXPATHLEN/4,
+        },
+        [NFSPROC_READ] = {
+                .pc_func = (svc_procfunc) nfsd_proc_read,
+                .pc_decode = (kxdrproc_t) nfssvc_decode_readargs,
+                .pc_encode = (kxdrproc_t) nfssvc_encode_readres,
+                .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
+                .pc_argsize = sizeof(struct nfsd_readargs),
+                .pc_ressize = sizeof(struct nfsd_readres),
+                .pc_cachetype = RC_NOCACHE,
+                .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4,
+        },
+        [NFSPROC_WRITECACHE] = {
+                .pc_decode = (kxdrproc_t) nfssvc_decode_void,
+                .pc_encode = (kxdrproc_t) nfssvc_encode_void,
+                .pc_argsize = sizeof(struct nfsd_void),
+                .pc_ressize = sizeof(struct nfsd_void),
+                .pc_cachetype = RC_NOCACHE,
+                .pc_xdrressize = ST,
+        },
+        [NFSPROC_WRITE] = {
+                .pc_func = (svc_procfunc) nfsd_proc_write,
+                .pc_decode = (kxdrproc_t) nfssvc_decode_writeargs,
+                .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat,
+                .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
+                .pc_argsize = sizeof(struct nfsd_writeargs),
+                .pc_ressize = sizeof(struct nfsd_attrstat),
+                .pc_cachetype = RC_REPLBUFF,
+                .pc_xdrressize = ST+AT,
+        },
+        [NFSPROC_CREATE] = {
+                .pc_func = (svc_procfunc) nfsd_proc_create,
+                .pc_decode = (kxdrproc_t) nfssvc_decode_createargs,
+                .pc_encode = (kxdrproc_t) nfssvc_encode_diropres,
+                .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
+                .pc_argsize = sizeof(struct nfsd_createargs),
+                .pc_ressize = sizeof(struct nfsd_diropres),
+                .pc_cachetype = RC_REPLBUFF,
+                .pc_xdrressize = ST+FH+AT,
+        },
+        [NFSPROC_REMOVE] = {
+                .pc_func = (svc_procfunc) nfsd_proc_remove,
+                .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs,
+                .pc_encode = (kxdrproc_t) nfssvc_encode_void,
+                .pc_argsize = sizeof(struct nfsd_diropargs),
+                .pc_ressize = sizeof(struct nfsd_void),
+                .pc_cachetype = RC_REPLSTAT,
+                .pc_xdrressize = ST,
+        },
+        [NFSPROC_RENAME] = {
+                .pc_func = (svc_procfunc) nfsd_proc_rename,
+                .pc_decode = (kxdrproc_t) nfssvc_decode_renameargs,
+                .pc_encode = (kxdrproc_t) nfssvc_encode_void,
+                .pc_argsize = sizeof(struct nfsd_renameargs),
+                .pc_ressize = sizeof(struct nfsd_void),
+                .pc_cachetype = RC_REPLSTAT,
+                .pc_xdrressize = ST,
+        },
+        [NFSPROC_LINK] = {
+                .pc_func = (svc_procfunc) nfsd_proc_link,
+                .pc_decode = (kxdrproc_t) nfssvc_decode_linkargs,
+                .pc_encode = (kxdrproc_t) nfssvc_encode_void,
+                .pc_argsize = sizeof(struct nfsd_linkargs),
+                .pc_ressize = sizeof(struct nfsd_void),
+                .pc_cachetype = RC_REPLSTAT,
+                .pc_xdrressize = ST,
+        },
+        [NFSPROC_SYMLINK] = {
+                .pc_func = (svc_procfunc) nfsd_proc_symlink,
+                .pc_decode = (kxdrproc_t) nfssvc_decode_symlinkargs,
+                .pc_encode = (kxdrproc_t) nfssvc_encode_void,
+                .pc_argsize = sizeof(struct nfsd_symlinkargs),
+                .pc_ressize = sizeof(struct nfsd_void),
+                .pc_cachetype = RC_REPLSTAT,
+                .pc_xdrressize = ST,
+        },
+        [NFSPROC_MKDIR] = {
+                .pc_func = (svc_procfunc) nfsd_proc_mkdir,
+                .pc_decode = (kxdrproc_t) nfssvc_decode_createargs,
+                .pc_encode = (kxdrproc_t) nfssvc_encode_diropres,
+                .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
+                .pc_argsize = sizeof(struct nfsd_createargs),
+                .pc_ressize = sizeof(struct nfsd_diropres),
+                .pc_cachetype = RC_REPLBUFF,
+                .pc_xdrressize = ST+FH+AT,
+        },
+        [NFSPROC_RMDIR] = {
+                .pc_func = (svc_procfunc) nfsd_proc_rmdir,
+                .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs,
+                .pc_encode = (kxdrproc_t) nfssvc_encode_void,
+                .pc_argsize = sizeof(struct nfsd_diropargs),
+                .pc_ressize = sizeof(struct nfsd_void),
+                .pc_cachetype = RC_REPLSTAT,
+                .pc_xdrressize = ST,
+        },
+        [NFSPROC_READDIR] = {
+                .pc_func = (svc_procfunc) nfsd_proc_readdir,
+                .pc_decode = (kxdrproc_t) nfssvc_decode_readdirargs,
+                .pc_encode = (kxdrproc_t) nfssvc_encode_readdirres,
+                .pc_argsize = sizeof(struct nfsd_readdirargs),
+                .pc_ressize = sizeof(struct nfsd_readdirres),
+                .pc_cachetype = RC_NOCACHE,
+        },
+        [NFSPROC_STATFS] = {
+                .pc_func = (svc_procfunc) nfsd_proc_statfs,
+                .pc_decode = (kxdrproc_t) nfssvc_decode_fhandle,
+                .pc_encode = (kxdrproc_t) nfssvc_encode_statfsres,
+                .pc_argsize = sizeof(struct nfsd_fhandle),
+                .pc_ressize = sizeof(struct nfsd_statfsres),
+                .pc_cachetype = RC_NOCACHE,
+                .pc_xdrressize = ST+5,
+        },
 };
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index cbba4a935786..492c79b7800b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -18,7 +18,6 @@
 #include <linux/unistd.h>
 #include <linux/slab.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/freezer.h>
 #include <linux/fs_struct.h>
 #include <linux/kthread.h>
@@ -390,12 +389,14 @@ nfsd_svc(unsigned short port, int nrservs)
        mutex_lock(&nfsd_mutex);
        dprintk("nfsd: creating service\n");
-        error = -EINVAL;
        if (nrservs <= 0)
                nrservs = 0;
        if (nrservs > NFSD_MAXSERVS)
                nrservs = NFSD_MAXSERVS;
-        
+        error = 0;
+        if (nrservs == 0 && nfsd_serv == NULL)
+                goto out;
        /* Readahead param cache - will no-op if it already exists */
        error = nfsd_racache_init(2*nrservs);
        if (error<0)
@@ -413,6 +414,12 @@ nfsd_svc(unsigned short port, int nrservs)
                goto failure;
        error = svc_set_num_threads(nfsd_serv, NULL, nrservs);
+        if (error == 0)
+                /* We are holding a reference to nfsd_serv which
+                 * we don't want to count in the return value,
+                 * so subtract 1
+                 */
+                error = nfsd_serv->sv_nrthreads - 1;
 failure:
        svc_destroy(nfsd_serv);         /* Release server */
 out:
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 99f835753596..23341c1063bc 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -678,7 +678,6 @@ __be32
 nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
                        int access, struct file **filp)
 {
-        const struct cred *cred = current_cred();
        struct dentry   *dentry;
        struct inode    *inode;
        int             flags = O_RDONLY|O_LARGEFILE;
@@ -733,7 +732,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
                vfs_dq_init(inode);
        }
        *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
-                            flags, cred);
+                            flags, current_cred());
        if (IS_ERR(*filp))
                host_err = PTR_ERR(*filp);
        else
@@ -966,6 +965,43 @@ static void kill_suid(struct dentry *dentry)
        mutex_unlock(&dentry->d_inode->i_mutex);
 }
+/*
+ * Gathered writes: If another process is currently writing to the file,
+ * there's a high chance this is another nfsd (triggered by a bulk write
+ * from a client's biod). Rather than syncing the file with each write
+ * request, we sleep for 10 msec.
+ *
+ * I don't know if this roughly approximates C. Juszak's idea of
+ * gathered writes, but it's a nice and simple solution (IMHO), and it
+ * seems to work:-)
+ *
+ * Note: we do this only in the NFSv2 case, since v3 and higher have a
+ * better tool (separate unstable writes and commits) for solving this
+ * problem.
+ *
+ * Returns 0 if the inode was not dirty after the optional sleep,
+ * otherwise the return value of nfsd_sync().
+ */
+static int wait_for_concurrent_writes(struct file *file)
+{
+        struct inode *inode = file->f_path.dentry->d_inode;
+        /* Inode and device of the previous call; function-static, so the
+         * values are shared by every caller of this function. */
+        static ino_t last_ino;
+        static dev_t last_dev;
+        int err = 0;
+        /* Defer if the file has more than one writer or if the previous
+         * call was for this same inode on the same device. */
+        if (atomic_read(&inode->i_writecount) > 1
+            || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
+                dprintk("nfsd: write defer %d\n", task_pid_nr(current));
+                msleep(10);
+                dprintk("nfsd: write resume %d\n", task_pid_nr(current));
+        }
+        /* Sync only if the inode is still marked dirty at this point. */
+        if (inode->i_state & I_DIRTY) {
+                dprintk("nfsd: write sync %d\n", task_pid_nr(current));
+                err = nfsd_sync(file);
+        }
+        /* Remember this file for the comparison made by the next call. */
+        last_ino = inode->i_ino;
+        last_dev = inode->i_sb->s_dev;
+        return err;
+}
 static __be32
 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
                                loff_t offset, struct kvec *vec, int vlen,
@@ -978,6 +1014,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
        __be32                  err = 0;
        int                     host_err;
        int                     stable = *stablep;
+        int                     use_wgather;
 #ifdef MSNFS
        err = nfserr_perm;
@@ -996,9 +1033,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
         *  -   the sync export option has been set, or
         *  -   the client requested O_SYNC behavior (NFSv3 feature).
         *  -   The file system doesn't support fsync().
-         * When gathered writes have been configured for this volume,
+         * When NFSv2 gathered writes have been configured for this volume,
         * flushing the data to disk is handled separately below.
         */
+        use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
        if (!file->f_op->fsync) {/* COMMIT3 cannot work */
               stable = 2;
@@ -1007,7 +1045,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
        if (!EX_ISSYNC(exp))
                stable = 0;
-        if (stable && !EX_WGATHER(exp)) {
+        if (stable && !use_wgather) {
                spin_lock(&file->f_lock);
                file->f_flags |= O_SYNC;
                spin_unlock(&file->f_lock);
@@ -1017,52 +1055,20 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
        oldfs = get_fs(); set_fs(KERNEL_DS);
        host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
        set_fs(oldfs);
-        if (host_err >= 0) {
+        if (host_err < 0)
-                *cnt = host_err;
+                goto out_nfserr;
-                nfsdstats.io_write += host_err;
+        *cnt = host_err;
-                fsnotify_modify(file->f_path.dentry);
+        nfsdstats.io_write += host_err;
-        }
+        fsnotify_modify(file->f_path.dentry);
        /* clear setuid/setgid flag after write */
-        if (host_err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID)))
+        if (inode->i_mode & (S_ISUID | S_ISGID))
                kill_suid(dentry);
-        if (host_err >= 0 && stable) {
+        if (stable && use_wgather)
-                static ino_t    last_ino;
+                host_err = wait_for_concurrent_writes(file);
-                static dev_t    last_dev;
-                /*
-                 * Gathered writes: If another process is currently
-                 * writing to the file, there's a high chance
-                 * this is another nfsd (triggered by a bulk write
-                 * from a client's biod). Rather than syncing the
-                 * file with each write request, we sleep for 10 msec.
-                 *
-                 * I don't know if this roughly approximates
-                 * C. Juszak's idea of gathered writes, but it's a
-                 * nice and simple solution (IMHO), and it seems to
-                 * work:-)
-                 */
-                if (EX_WGATHER(exp)) {
-                        if (atomic_read(&inode->i_writecount) > 1
-                            || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
-                                dprintk("nfsd: write defer %d\n", task_pid_nr(current));
-                                msleep(10);
-                                dprintk("nfsd: write resume %d\n", task_pid_nr(current));
-                        }
-                        if (inode->i_state & I_DIRTY) {
-                                dprintk("nfsd: write sync %d\n", task_pid_nr(current));
-                                host_err=nfsd_sync(file);
-                        }
-#if 0
-                        wake_up(&inode->i_wait);
-#endif
-                }
-                last_ino = inode->i_ino;
-                last_dev = inode->i_sb->s_dev;
-        }
+out_nfserr:
        dprintk("nfsd: write complete host_err=%d\n", host_err);
        if (host_err >= 0)
                err = 0;
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
new file mode 100644
index 000000000000..72da095d4009
--- /dev/null
+++ b/fs/nilfs2/Kconfig
@@ -0,0 +1,25 @@
+config NILFS2_FS
+        tristate "NILFS2 file system support (EXPERIMENTAL)"
+        depends on BLOCK && EXPERIMENTAL
+        select CRC32
+        help
+          NILFS2 is a log-structured file system (LFS) supporting continuous
+          snapshotting.  In addition to versioning capability of the entire
+          file system, users can even restore files mistakenly overwritten or
+          destroyed just a few seconds ago.  Since this file system can keep
+          consistency like conventional LFS, it achieves quick recovery after
+          system crashes.
+          NILFS2 creates a number of checkpoints every few seconds or per
+          synchronous write basis (unless there is no change).  Users can
+          select significant versions among continuously created checkpoints,
+          and can change them into snapshots which will be preserved for long
+          periods until they are changed back to checkpoints.  Each
+          snapshot is mountable as a read-only file system concurrently with
+          its writable mount, and this feature is convenient for online backup.
+          Some features including atime, extended attributes, and POSIX ACLs,
+          are not supported yet.
+          To compile this file system support as a module, choose M here: the
+          module will be called nilfs2.  If unsure, say N.
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 064279e33bbb..99d58a028b94 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -31,21 +31,26 @@
 #include "dat.h"
 #include "alloc.h"
+/*
+ * nilfs_bmap_get_dat - return the inode that nilfs_dat_inode() yields for
+ * the nilfs object reached from @bmap's b_inode via NILFS_I_NILFS().
+ */
+struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
+{
+        return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
+}
 int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
                               __u64 *ptrp)
 {
-        __u64 ptr;
+        sector_t blocknr;
        int ret;
        down_read(&bmap->b_sem);
        ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
        if (ret < 0)
                goto out;
-        if (bmap->b_pops->bpop_translate != NULL) {
+        if (NILFS_BMAP_USE_VBN(bmap)) {
-                ret = bmap->b_pops->bpop_translate(bmap, *ptrp, &ptr);
+                ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp,
-                if (ret < 0)
+                                          &blocknr);
-                        goto out;
+                if (!ret)
-                *ptrp = ptr;
+                        *ptrp = blocknr;
        }
 out:
@@ -53,6 +58,16 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
        return ret;
 }
+/*
+ * nilfs_bmap_lookup_contig - call the bmap's bop_lookup_contig operation
+ * with @key, @ptrp and @maxblocks while holding @bmap->b_sem for reading,
+ * and return that operation's result unchanged.
+ */
+int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
+                             unsigned maxblocks)
+{
+        int ret;
+        down_read(&bmap->b_sem);
+        ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks);
+        up_read(&bmap->b_sem);
+        return ret;
+}
 /**
 * nilfs_bmap_lookup - find a record
@@ -101,8 +116,7 @@ static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
                        if (n < 0)
                                return n;
                        ret = nilfs_btree_convert_and_insert(
-                                bmap, key, ptr, keys, ptrs, n,
+                                bmap, key, ptr, keys, ptrs, n);
-                                NILFS_BMAP_LARGE_LOW, NILFS_BMAP_LARGE_HIGH);
                        if (ret == 0)
                                bmap->b_u.u_flags |= NILFS_BMAP_LARGE;
@@ -158,8 +172,7 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
                        if (n < 0)
                                return n;
                        ret = nilfs_direct_delete_and_convert(
-                                bmap, key, keys, ptrs, n,
+                                bmap, key, keys, ptrs, n);
-                                NILFS_BMAP_SMALL_LOW, NILFS_BMAP_SMALL_HIGH);
                        if (ret == 0)
                                bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE;
@@ -417,38 +430,6 @@ void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
                mark_inode_dirty(bmap->b_inode);
 }
-int nilfs_bmap_get_block(const struct nilfs_bmap *bmap, __u64 ptr,
-                         struct buffer_head **bhp)
-{
-        return nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
-                                ptr, 0, bhp, 0);
-}
-void nilfs_bmap_put_block(const struct nilfs_bmap *bmap,
-                          struct buffer_head *bh)
-{
-        brelse(bh);
-}
-int nilfs_bmap_get_new_block(const struct nilfs_bmap *bmap, __u64 ptr,
-                             struct buffer_head **bhp)
-{
-        int ret;
-        ret = nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
-                               ptr, 0, bhp, 1);
-        if (ret < 0)
-                return ret;
-        set_buffer_nilfs_volatile(*bhp);
-        return 0;
-}
-void nilfs_bmap_delete_block(const struct nilfs_bmap *bmap,
-                             struct buffer_head *bh)
-{
-        nilfs_btnode_delete(bh);
-}
 __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
                              const struct buffer_head *bh)
 {
@@ -476,11 +457,6 @@ __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key)
                return NILFS_BMAP_INVALID_PTR;
 }
-static struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
-{
-        return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
-}
 #define NILFS_BMAP_GROUP_DIV    8
 __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
 {
@@ -493,64 +469,51 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
                (entries_per_group / NILFS_BMAP_GROUP_DIV);
 }
-static int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
+int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
-                                      union nilfs_bmap_ptr_req *req)
+                                 union nilfs_bmap_ptr_req *req)
 {
        return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
 }
-static void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
+void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
-                                      union nilfs_bmap_ptr_req *req)
+                                 union nilfs_bmap_ptr_req *req)
 {
        nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
 }
-static void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
+void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
-                                     union nilfs_bmap_ptr_req *req)
+                              union nilfs_bmap_ptr_req *req)
 {
        nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
 }
-static int nilfs_bmap_prepare_start_v(struct nilfs_bmap *bmap,
+int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req,
-                                      union nilfs_bmap_ptr_req *req)
+                       sector_t blocknr)
 {
-        return nilfs_dat_prepare_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
+        struct inode *dat = nilfs_bmap_get_dat(bmap);
-}
+        int ret;
-static void nilfs_bmap_commit_start_v(struct nilfs_bmap *bmap,
-                                      union nilfs_bmap_ptr_req *req,
-                                      sector_t blocknr)
-{
-        nilfs_dat_commit_start(nilfs_bmap_get_dat(bmap), &req->bpr_req,
-                               blocknr);
-}
-static void nilfs_bmap_abort_start_v(struct nilfs_bmap *bmap,
+        ret = nilfs_dat_prepare_start(dat, &req->bpr_req);
-                                     union nilfs_bmap_ptr_req *req)
+        if (likely(!ret))
-{
+                nilfs_dat_commit_start(dat, &req->bpr_req, blocknr);
-        nilfs_dat_abort_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
+        return ret;
 }
-static int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
+int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
-                                    union nilfs_bmap_ptr_req *req)
+                             union nilfs_bmap_ptr_req *req)
 {
        return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
 }
-static void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
+void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
-                                    union nilfs_bmap_ptr_req *req)
+                             union nilfs_bmap_ptr_req *req)
-{
-        nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 0);
-}
-static void nilfs_bmap_commit_end_vmdt(struct nilfs_bmap *bmap,
-                                       union nilfs_bmap_ptr_req *req)
 {
-        nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 1);
+        nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req,
+                             bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
 }
-static void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
+void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
-                                   union nilfs_bmap_ptr_req *req)
+                            union nilfs_bmap_ptr_req *req)
 {
        nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
 }
@@ -566,129 +529,46 @@ int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
        return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
 }
-int nilfs_bmap_prepare_update(struct nilfs_bmap *bmap,
+int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap,
-                              union nilfs_bmap_ptr_req *oldreq,
+                                union nilfs_bmap_ptr_req *oldreq,
-                              union nilfs_bmap_ptr_req *newreq)
+                                union nilfs_bmap_ptr_req *newreq)
 {
+        struct inode *dat = nilfs_bmap_get_dat(bmap);
        int ret;
-        ret = bmap->b_pops->bpop_prepare_end_ptr(bmap, oldreq);
+        ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req);
        if (ret < 0)
                return ret;
-        ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, newreq);
+        ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req);
        if (ret < 0)
-                bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
+                nilfs_dat_abort_end(dat, &oldreq->bpr_req);
        return ret;
 }
-void nilfs_bmap_commit_update(struct nilfs_bmap *bmap,
+void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap,
-                              union nilfs_bmap_ptr_req *oldreq,
+                                union nilfs_bmap_ptr_req *oldreq,
-                              union nilfs_bmap_ptr_req *newreq)
+                                union nilfs_bmap_ptr_req *newreq)
 {
-        bmap->b_pops->bpop_commit_end_ptr(bmap, oldreq);
+        struct inode *dat = nilfs_bmap_get_dat(bmap);
-        bmap->b_pops->bpop_commit_alloc_ptr(bmap, newreq);
-}
-void nilfs_bmap_abort_update(struct nilfs_bmap *bmap,
+        nilfs_dat_commit_end(dat, &oldreq->bpr_req,
-                             union nilfs_bmap_ptr_req *oldreq,
+                             bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
-                             union nilfs_bmap_ptr_req *newreq)
+        nilfs_dat_commit_alloc(dat, &newreq->bpr_req);
-{
-        bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
-        bmap->b_pops->bpop_abort_alloc_ptr(bmap, newreq);
 }
-static int nilfs_bmap_translate_v(const struct nilfs_bmap *bmap, __u64 ptr,
+void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap,
-                                  __u64 *ptrp)
+                               union nilfs_bmap_ptr_req *oldreq,
+                               union nilfs_bmap_ptr_req *newreq)
 {
-        sector_t blocknr;
+        struct inode *dat = nilfs_bmap_get_dat(bmap);
-        int ret;
-        ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), ptr, &blocknr);
-        if (ret < 0)
-                return ret;
-        if (ptrp != NULL)
-                *ptrp = blocknr;
-        return 0;
-}
-static int nilfs_bmap_prepare_alloc_p(struct nilfs_bmap *bmap,
+        nilfs_dat_abort_end(dat, &oldreq->bpr_req);
-                                      union nilfs_bmap_ptr_req *req)
+        nilfs_dat_abort_alloc(dat, &newreq->bpr_req);
-{
-        /* ignore target ptr */
-        req->bpr_ptr = bmap->b_last_allocated_ptr++;
-        return 0;
 }
-static void nilfs_bmap_commit_alloc_p(struct nilfs_bmap *bmap,
-                                      union nilfs_bmap_ptr_req *req)
-{
-        /* do nothing */
-}
-static void nilfs_bmap_abort_alloc_p(struct nilfs_bmap *bmap,
-                                     union nilfs_bmap_ptr_req *req)
-{
-        bmap->b_last_allocated_ptr--;
-}
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_v = {
-        .bpop_prepare_alloc_ptr =       nilfs_bmap_prepare_alloc_v,
-        .bpop_commit_alloc_ptr  =       nilfs_bmap_commit_alloc_v,
-        .bpop_abort_alloc_ptr   =       nilfs_bmap_abort_alloc_v,
-        .bpop_prepare_start_ptr =       nilfs_bmap_prepare_start_v,
-        .bpop_commit_start_ptr  =       nilfs_bmap_commit_start_v,
-        .bpop_abort_start_ptr   =       nilfs_bmap_abort_start_v,
-        .bpop_prepare_end_ptr   =       nilfs_bmap_prepare_end_v,
-        .bpop_commit_end_ptr    =       nilfs_bmap_commit_end_v,
-        .bpop_abort_end_ptr     =       nilfs_bmap_abort_end_v,
-        .bpop_translate         =       nilfs_bmap_translate_v,
-};
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_vmdt = {
-        .bpop_prepare_alloc_ptr =       nilfs_bmap_prepare_alloc_v,
-        .bpop_commit_alloc_ptr  =       nilfs_bmap_commit_alloc_v,
-        .bpop_abort_alloc_ptr   =       nilfs_bmap_abort_alloc_v,
-        .bpop_prepare_start_ptr =       nilfs_bmap_prepare_start_v,
-        .bpop_commit_start_ptr  =       nilfs_bmap_commit_start_v,
-        .bpop_abort_start_ptr   =       nilfs_bmap_abort_start_v,
-        .bpop_prepare_end_ptr   =       nilfs_bmap_prepare_end_v,
-        .bpop_commit_end_ptr    =       nilfs_bmap_commit_end_vmdt,
-        .bpop_abort_end_ptr     =       nilfs_bmap_abort_end_v,
-        .bpop_translate         =       nilfs_bmap_translate_v,
-};
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_p = {
-        .bpop_prepare_alloc_ptr =       nilfs_bmap_prepare_alloc_p,
-        .bpop_commit_alloc_ptr  =       nilfs_bmap_commit_alloc_p,
-        .bpop_abort_alloc_ptr   =       nilfs_bmap_abort_alloc_p,
-        .bpop_prepare_start_ptr =       NULL,
-        .bpop_commit_start_ptr  =       NULL,
-        .bpop_abort_start_ptr   =       NULL,
-        .bpop_prepare_end_ptr   =       NULL,
-        .bpop_commit_end_ptr    =       NULL,
-        .bpop_abort_end_ptr     =       NULL,
-        .bpop_translate         =       NULL,
-};
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = {
-        .bpop_prepare_alloc_ptr =       NULL,
-        .bpop_commit_alloc_ptr  =       NULL,
-        .bpop_abort_alloc_ptr   =       NULL,
-        .bpop_prepare_start_ptr =       NULL,
-        .bpop_commit_start_ptr  =       NULL,
-        .bpop_abort_start_ptr   =       NULL,
-        .bpop_prepare_end_ptr   =       NULL,
-        .bpop_commit_end_ptr    =       NULL,
-        .bpop_abort_end_ptr     =       NULL,
-        .bpop_translate         =       NULL,
-};
 static struct lock_class_key nilfs_bmap_dat_lock_key;
+static struct lock_class_key nilfs_bmap_mdt_lock_key;
 /**
 * nilfs_bmap_read - read a bmap from an inode
@@ -714,31 +594,30 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
        bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
        switch (bmap->b_inode->i_ino) {
        case NILFS_DAT_INO:
-                bmap->b_pops = &nilfs_bmap_ptr_ops_p;
+                bmap->b_ptr_type = NILFS_BMAP_PTR_P;
-                bmap->b_last_allocated_key = 0; /* XXX: use macro */
+                bmap->b_last_allocated_key = 0;
                bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
                lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
                break;
        case NILFS_CPFILE_INO:
        case NILFS_SUFILE_INO:
-                bmap->b_pops = &nilfs_bmap_ptr_ops_vmdt;
+                bmap->b_ptr_type = NILFS_BMAP_PTR_VS;
-                bmap->b_last_allocated_key = 0; /* XXX: use macro */
+                bmap->b_last_allocated_key = 0;
                bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
+                lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key);
                break;
+        case NILFS_IFILE_INO:
+                lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key);
+                /* Fall through */
        default:
-                bmap->b_pops = &nilfs_bmap_ptr_ops_v;
+                bmap->b_ptr_type = NILFS_BMAP_PTR_VM;
-                bmap->b_last_allocated_key = 0; /* XXX: use macro */
+                bmap->b_last_allocated_key = 0;
                bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
                break;
        }
        return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ?
-                nilfs_btree_init(bmap,
+                nilfs_btree_init(bmap) : nilfs_direct_init(bmap);
-                                 NILFS_BMAP_LARGE_LOW,
-                                 NILFS_BMAP_LARGE_HIGH) :
-                nilfs_direct_init(bmap,
-                                  NILFS_BMAP_SMALL_LOW,
-                                  NILFS_BMAP_SMALL_HIGH);
 }
 /**
@@ -764,7 +643,7 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
        memset(&bmap->b_u, 0, NILFS_BMAP_SIZE);
        init_rwsem(&bmap->b_sem);
        bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
-        bmap->b_pops = &nilfs_bmap_ptr_ops_gc;
+        bmap->b_ptr_type = NILFS_BMAP_PTR_U;
        bmap->b_last_allocated_key = 0;
        bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
        bmap->b_state = 0;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 4f2708abb1ba..b2890cdcef12 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -64,6 +64,8 @@ struct nilfs_bmap_stats {
 */
 struct nilfs_bmap_operations {
        int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
+        int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *,
+                                 unsigned);
        int (*bop_insert)(struct nilfs_bmap *, __u64, __u64);
        int (*bop_delete)(struct nilfs_bmap *, __u64);
        void (*bop_clear)(struct nilfs_bmap *);
@@ -86,34 +88,6 @@ struct nilfs_bmap_operations {
 };
-/**
- * struct nilfs_bmap_ptr_operations - bmap ptr operation table
- */
-struct nilfs_bmap_ptr_operations {
-        int (*bpop_prepare_alloc_ptr)(struct nilfs_bmap *,
-                                      union nilfs_bmap_ptr_req *);
-        void (*bpop_commit_alloc_ptr)(struct nilfs_bmap *,
-                                      union nilfs_bmap_ptr_req *);
-        void (*bpop_abort_alloc_ptr)(struct nilfs_bmap *,
-                                     union nilfs_bmap_ptr_req *);
-        int (*bpop_prepare_start_ptr)(struct nilfs_bmap *,
-                                      union nilfs_bmap_ptr_req *);
-        void (*bpop_commit_start_ptr)(struct nilfs_bmap *,
-                                      union nilfs_bmap_ptr_req *,
-                                      sector_t);
-        void (*bpop_abort_start_ptr)(struct nilfs_bmap *,
-                                     union nilfs_bmap_ptr_req *);
-        int (*bpop_prepare_end_ptr)(struct nilfs_bmap *,
-                                    union nilfs_bmap_ptr_req *);
-        void (*bpop_commit_end_ptr)(struct nilfs_bmap *,
-                                    union nilfs_bmap_ptr_req *);
-        void (*bpop_abort_end_ptr)(struct nilfs_bmap *,
-                                   union nilfs_bmap_ptr_req *);
-        int (*bpop_translate)(const struct nilfs_bmap *, __u64, __u64 *);
-};
 #define NILFS_BMAP_SIZE         (NILFS_INODE_BMAP_SIZE * sizeof(__le64))
 #define NILFS_BMAP_KEY_BIT      (sizeof(unsigned long) * 8 /* CHAR_BIT */)
 #define NILFS_BMAP_NEW_PTR_INIT \
@@ -131,11 +105,9 @@ static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
 * @b_sem: semaphore
 * @b_inode: owner of bmap
 * @b_ops: bmap operation table
- * @b_pops: bmap ptr operation table
- * @b_low: low watermark of conversion
- * @b_high: high watermark of conversion
 * @b_last_allocated_key: last allocated key for data block
 * @b_last_allocated_ptr: last allocated ptr for data block
+ * @b_ptr_type: pointer type
 * @b_state: state
 */
 struct nilfs_bmap {
@@ -146,14 +118,22 @@ struct nilfs_bmap {
        struct rw_semaphore b_sem;
        struct inode *b_inode;
        const struct nilfs_bmap_operations *b_ops;
-        const struct nilfs_bmap_ptr_operations *b_pops;
-        __u64 b_low;
-        __u64 b_high;
        __u64 b_last_allocated_key;
        __u64 b_last_allocated_ptr;
+        int b_ptr_type;
        int b_state;
 };
+/* pointer type */
+#define NILFS_BMAP_PTR_P        0       /* physical block number (i.e. LBN) */
+#define NILFS_BMAP_PTR_VS       1       /* virtual block number (single
+                                           version) */
+#define NILFS_BMAP_PTR_VM       2       /* virtual block number (has multiple
+                                           versions) */
+#define NILFS_BMAP_PTR_U        (-1)    /* never perform pointer operations */
+#define NILFS_BMAP_USE_VBN(bmap)        ((bmap)->b_ptr_type > 0)
 /* state */
 #define NILFS_BMAP_DIRTY        0x00000001
@@ -162,6 +142,7 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
 void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
 int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
+int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
 int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
 int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
 int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
@@ -182,7 +163,67 @@ void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 /*
 * Internal use only
 */
+struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *);
+int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *,
+                               union nilfs_bmap_ptr_req *);
+void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *,
+                               union nilfs_bmap_ptr_req *);
+void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *,
+                              union nilfs_bmap_ptr_req *);
+static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap,
+                                               union nilfs_bmap_ptr_req *req)
+{
+        if (NILFS_BMAP_USE_VBN(bmap))
+                return nilfs_bmap_prepare_alloc_v(bmap, req);
+        /* ignore target ptr */
+        req->bpr_ptr = bmap->b_last_allocated_ptr++;
+        return 0;
+}
+static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap,
+                                               union nilfs_bmap_ptr_req *req)
+{
+        if (NILFS_BMAP_USE_VBN(bmap))
+                nilfs_bmap_commit_alloc_v(bmap, req);
+}
+static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap,
+                                              union nilfs_bmap_ptr_req *req)
+{
+        if (NILFS_BMAP_USE_VBN(bmap))
+                nilfs_bmap_abort_alloc_v(bmap, req);
+        else
+                bmap->b_last_allocated_ptr--;
+}
+int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
+void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
+void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
+static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap,
+                                             union nilfs_bmap_ptr_req *req)
+{
+        return NILFS_BMAP_USE_VBN(bmap) ?
+                nilfs_bmap_prepare_end_v(bmap, req) : 0;
+}
+static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap,
+                                             union nilfs_bmap_ptr_req *req)
+{
+        if (NILFS_BMAP_USE_VBN(bmap))
+                nilfs_bmap_commit_end_v(bmap, req);
+}
+static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
+                                            union nilfs_bmap_ptr_req *req)
+{
+        if (NILFS_BMAP_USE_VBN(bmap))
+                nilfs_bmap_abort_end_v(bmap, req);
+}
+int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *,
+                       sector_t);
 int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
 int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
@@ -193,28 +234,20 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
 __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
 __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
-int nilfs_bmap_prepare_update(struct nilfs_bmap *,
+int nilfs_bmap_prepare_update_v(struct nilfs_bmap *,
-                              union nilfs_bmap_ptr_req *,
+                                union nilfs_bmap_ptr_req *,
-                              union nilfs_bmap_ptr_req *);
+                                union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_update(struct nilfs_bmap *,
+void nilfs_bmap_commit_update_v(struct nilfs_bmap *,
-                              union nilfs_bmap_ptr_req *,
+                                union nilfs_bmap_ptr_req *,
-                              union nilfs_bmap_ptr_req *);
+                                union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_update(struct nilfs_bmap *,
+void nilfs_bmap_abort_update_v(struct nilfs_bmap *,
-                             union nilfs_bmap_ptr_req *,
+                               union nilfs_bmap_ptr_req *,
-                             union nilfs_bmap_ptr_req *);
+                               union nilfs_bmap_ptr_req *);
 void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
 void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
-int nilfs_bmap_get_block(const struct nilfs_bmap *, __u64,
-                         struct buffer_head **);
-void nilfs_bmap_put_block(const struct nilfs_bmap *, struct buffer_head *);
-int nilfs_bmap_get_new_block(const struct nilfs_bmap *, __u64,
-                             struct buffer_head **);
-void nilfs_bmap_delete_block(const struct nilfs_bmap *, struct buffer_head *);
 /* Assume that bmap semaphore is locked. */
 static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
 {
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 4cc07b2c30e0..7e0b61be212e 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -46,15 +46,18 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc)
        INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
 }
-static struct address_space_operations def_btnode_aops;
+static struct address_space_operations def_btnode_aops = {
+        .sync_page              = block_sync_page,
+};
-void nilfs_btnode_cache_init(struct address_space *btnc)
+void nilfs_btnode_cache_init(struct address_space *btnc,
+                             struct backing_dev_info *bdi)
 {
        btnc->host = NULL;  /* can safely set to host inode ? */
        btnc->flags = 0;
        mapping_set_gfp_mask(btnc, GFP_NOFS);
        btnc->assoc_mapping = NULL;
-        btnc->backing_dev_info = &default_backing_dev_info;
+        btnc->backing_dev_info = bdi;
        btnc->a_ops = &def_btnode_aops;
 }
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 35faa86444a7..3e2275172ed6 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -38,7 +38,7 @@ struct nilfs_btnode_chkey_ctxt {
 };
 void nilfs_btnode_cache_init_once(struct address_space *);
-void nilfs_btnode_cache_init(struct address_space *);
+void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
 int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
                              struct buffer_head **, int);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 6b37a2767293..aa412724b64e 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -29,6 +29,7 @@
 #include "btnode.h"
 #include "btree.h"
 #include "alloc.h"
+#include "dat.h"
 /**
 * struct nilfs_btree_path - A path on which B-tree operations are executed
@@ -109,8 +110,7 @@ static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
             level < NILFS_BTREE_LEVEL_MAX;
             level++) {
                if (path[level].bp_bh != NULL) {
-                        nilfs_bmap_put_block(&btree->bt_bmap,
+                        brelse(path[level].bp_bh);
-                                             path[level].bp_bh);
                        path[level].bp_bh = NULL;
                }
                /* sib_bh is released or deleted by prepare or commit
@@ -123,10 +123,29 @@ static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
        }
 }
 /*
 * B-tree node operations
 */
+static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr,
+                                 struct buffer_head **bhp)
+{
+        struct address_space *btnc =
+                &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
+        return nilfs_btnode_get(btnc, ptr, 0, bhp, 0);
+}
+static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
+                                     __u64 ptr, struct buffer_head **bhp)
+{
+        struct address_space *btnc =
+                &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
+        int ret;
+        ret = nilfs_btnode_get(btnc, ptr, 0, bhp, 1);
+        if (!ret)
+                set_buffer_nilfs_volatile(*bhp);
+        return ret;
+}
 static inline int
 nilfs_btree_node_get_flags(const struct nilfs_btree *btree,
@@ -488,8 +507,7 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
        path[level].bp_index = index;
        for (level--; level >= minlevel; level--) {
-                ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
+                ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
-                                           &path[level].bp_bh);
                if (ret < 0)
                        return ret;
                node = nilfs_btree_get_nonroot_node(btree, path, level);
@@ -535,8 +553,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
        path[level].bp_index = index;
        for (level--; level > 0; level--) {
-                ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
+                ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
-                                           &path[level].bp_bh);
                if (ret < 0)
                        return ret;
                node = nilfs_btree_get_nonroot_node(btree, path, level);
@@ -579,6 +596,87 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
        return ret;
 }
+static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
+                                     __u64 key, __u64 *ptrp, unsigned maxblocks)
+{
+        struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+        struct nilfs_btree_path *path;
+        struct nilfs_btree_node *node;
+        struct inode *dat = NULL;
+        __u64 ptr, ptr2;
+        sector_t blocknr;
+        int level = NILFS_BTREE_LEVEL_NODE_MIN;
+        int ret, cnt, index, maxlevel;
+        path = nilfs_btree_alloc_path(btree);
+        if (path == NULL)
+                return -ENOMEM;
+        nilfs_btree_init_path(btree, path);
+        ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
+        if (ret < 0)
+                goto out;
+        if (NILFS_BMAP_USE_VBN(bmap)) {
+                dat = nilfs_bmap_get_dat(bmap);
+                ret = nilfs_dat_translate(dat, ptr, &blocknr);
+                if (ret < 0)
+                        goto out;
+                ptr = blocknr;
+        }
+        cnt = 1;
+        if (cnt == maxblocks)
+                goto end;
+        maxlevel = nilfs_btree_height(btree) - 1;
+        node = nilfs_btree_get_node(btree, path, level);
+        index = path[level].bp_index + 1;
+        for (;;) {
+                while (index < nilfs_btree_node_get_nchildren(btree, node)) {
+                        if (nilfs_btree_node_get_key(btree, node, index) !=
+                            key + cnt)
+                                goto end;
+                        ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
+                        if (dat) {
+                                ret = nilfs_dat_translate(dat, ptr2, &blocknr);
+                                if (ret < 0)
+                                        goto out;
+                                ptr2 = blocknr;
+                        }
+                        if (ptr2 != ptr + cnt || ++cnt == maxblocks)
+                                goto end;
+                        index++;
+                        continue;
+                }
+                if (level == maxlevel)
+                        break;
+                /* look-up right sibling node */
+                node = nilfs_btree_get_node(btree, path, level + 1);
+                index = path[level + 1].bp_index + 1;
+                if (index >= nilfs_btree_node_get_nchildren(btree, node) ||
+                    nilfs_btree_node_get_key(btree, node, index) != key + cnt)
+                        break;
+                ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
+                path[level + 1].bp_index = index;
+                brelse(path[level].bp_bh);
+                path[level].bp_bh = NULL;
+                ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh);
+                if (ret < 0)
+                        goto out;
+                node = nilfs_btree_get_nonroot_node(btree, path, level);
+                index = 0;
+                path[level].bp_index = index;
+        }
+ end:
+        *ptrp = ptr;
+        ret = cnt;
+ out:
+        nilfs_btree_clear_path(btree, path);
+        nilfs_btree_free_path(btree, path);
+        return ret;
+}
 static void nilfs_btree_promote_key(struct nilfs_btree *btree,
                                    struct nilfs_btree_path *path,
                                    int level, __u64 key)
@@ -669,13 +767,13 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
                                nilfs_btree_node_get_key(btree, node, 0));
        if (move) {
-                nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+                brelse(path[level].bp_bh);
                path[level].bp_bh = path[level].bp_sib_bh;
                path[level].bp_sib_bh = NULL;
                path[level].bp_index += lnchildren;
                path[level + 1].bp_index--;
        } else {
-                nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+                brelse(path[level].bp_sib_bh);
                path[level].bp_sib_bh = NULL;
                path[level].bp_index -= n;
        }
@@ -722,14 +820,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
        path[level + 1].bp_index--;
        if (move) {
-                nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+                brelse(path[level].bp_bh);
                path[level].bp_bh = path[level].bp_sib_bh;
                path[level].bp_sib_bh = NULL;
                path[level].bp_index -=
                        nilfs_btree_node_get_nchildren(btree, node);
                path[level + 1].bp_index++;
        } else {
-                nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+                brelse(path[level].bp_sib_bh);
                path[level].bp_sib_bh = NULL;
        }
@@ -781,7 +879,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
                *keyp = nilfs_btree_node_get_key(btree, right, 0);
                *ptrp = path[level].bp_newreq.bpr_ptr;
-                nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+                brelse(path[level].bp_bh);
                path[level].bp_bh = path[level].bp_sib_bh;
                path[level].bp_sib_bh = NULL;
        } else {
@@ -790,7 +888,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
                *keyp = nilfs_btree_node_get_key(btree, right, 0);
                *ptrp = path[level].bp_newreq.bpr_ptr;
-                nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+                brelse(path[level].bp_sib_bh);
                path[level].bp_sib_bh = NULL;
        }
@@ -897,12 +995,12 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
        level = NILFS_BTREE_LEVEL_DATA;
        /* allocate a new ptr for data block */
-        if (btree->bt_ops->btop_find_target != NULL)
+        if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
                path[level].bp_newreq.bpr_ptr =
-                        btree->bt_ops->btop_find_target(btree, path, key);
+                        nilfs_btree_find_target_v(btree, path, key);
-        ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
+        ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-                &btree->bt_bmap, &path[level].bp_newreq);
+                                           &path[level].bp_newreq);
        if (ret < 0)
                goto err_out_data;
@@ -924,8 +1022,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
                if (pindex > 0) {
                        sibptr = nilfs_btree_node_get_ptr(btree, parent,
                                                          pindex - 1);
-                        ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
+                        ret = nilfs_btree_get_block(btree, sibptr, &bh);
-                                                   &bh);
                        if (ret < 0)
                                goto err_out_child_node;
                        sib = (struct nilfs_btree_node *)bh->b_data;
@@ -936,7 +1033,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
                                stats->bs_nblocks++;
                                goto out;
                        } else
-                                nilfs_bmap_put_block(&btree->bt_bmap, bh);
+                                brelse(bh);
                }
                /* right sibling */
@@ -944,8 +1041,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
                    nilfs_btree_node_get_nchildren(btree, parent) - 1) {
                        sibptr = nilfs_btree_node_get_ptr(btree, parent,
                                                          pindex + 1);
-                        ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
+                        ret = nilfs_btree_get_block(btree, sibptr, &bh);
-                                                   &bh);
                        if (ret < 0)
                                goto err_out_child_node;
                        sib = (struct nilfs_btree_node *)bh->b_data;
@@ -956,19 +1052,19 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
                                stats->bs_nblocks++;
                                goto out;
                        } else
-                                nilfs_bmap_put_block(&btree->bt_bmap, bh);
+                                brelse(bh);
                }
                /* split */
                path[level].bp_newreq.bpr_ptr =
                        path[level - 1].bp_newreq.bpr_ptr + 1;
-                ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
+                ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-                        &btree->bt_bmap, &path[level].bp_newreq);
+                                                   &path[level].bp_newreq);
                if (ret < 0)
                        goto err_out_child_node;
-                ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
+                ret = nilfs_btree_get_new_block(btree,
-                                               path[level].bp_newreq.bpr_ptr,
+                                                path[level].bp_newreq.bpr_ptr,
-                                               &bh);
+                                                &bh);
                if (ret < 0)
                        goto err_out_curr_node;
@@ -994,12 +1090,12 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
        /* grow */
        path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
-        ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
+        ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-                &btree->bt_bmap, &path[level].bp_newreq);
+                                           &path[level].bp_newreq);
        if (ret < 0)
                goto err_out_child_node;
-        ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
+        ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
-                                       path[level].bp_newreq.bpr_ptr, &bh);
+                                        &bh);
        if (ret < 0)
                goto err_out_curr_node;
@@ -1023,18 +1119,16 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
        /* error */
 err_out_curr_node:
-        btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
+        nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
-                                                    &path[level].bp_newreq);
 err_out_child_node:
        for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
-                nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
+                nilfs_btnode_delete(path[level].bp_sib_bh);
-                btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(
+                nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
-                        &btree->bt_bmap, &path[level].bp_newreq);
+                                           &path[level].bp_newreq);
        }
-        btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
+        nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
-                                                       &path[level].bp_newreq);
 err_out_data:
        *levelp = level;
        stats->bs_nblocks = 0;
@@ -1049,14 +1143,12 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
        set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
        ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
-        if (btree->bt_ops->btop_set_target != NULL)
+        if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
-                btree->bt_ops->btop_set_target(btree, key, ptr);
+                nilfs_btree_set_target_v(btree, key, ptr);
        for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
-                if (btree->bt_bmap.b_pops->bpop_commit_alloc_ptr != NULL) {
+                nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
-                        btree->bt_bmap.b_pops->bpop_commit_alloc_ptr(
+                                            &path[level - 1].bp_newreq);
-                                &btree->bt_bmap, &path[level - 1].bp_newreq);
-                }
                path[level].bp_op(btree, path, level, &key, &ptr);
        }
@@ -1153,7 +1245,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
        nilfs_btree_promote_key(btree, path, level + 1,
                                nilfs_btree_node_get_key(btree, node, 0));
-        nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+        brelse(path[level].bp_sib_bh);
        path[level].bp_sib_bh = NULL;
        path[level].bp_index += n;
 }
@@ -1192,7 +1284,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
                                nilfs_btree_node_get_key(btree, right, 0));
        path[level + 1].bp_index--;
-        nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+        brelse(path[level].bp_sib_bh);
        path[level].bp_sib_bh = NULL;
 }
@@ -1221,7 +1313,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
        unlock_buffer(path[level].bp_bh);
        unlock_buffer(path[level].bp_sib_bh);
-        nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
+        nilfs_btnode_delete(path[level].bp_bh);
        path[level].bp_bh = path[level].bp_sib_bh;
        path[level].bp_sib_bh = NULL;
        path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left);
@@ -1252,7 +1344,7 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
        unlock_buffer(path[level].bp_bh);
        unlock_buffer(path[level].bp_sib_bh);
-        nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
+        nilfs_btnode_delete(path[level].bp_sib_bh);
        path[level].bp_sib_bh = NULL;
        path[level + 1].bp_index++;
 }
@@ -1276,7 +1368,7 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
        nilfs_btree_node_move_left(btree, root, child, n);
        unlock_buffer(path[level].bp_bh);
-        nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
+        nilfs_btnode_delete(path[level].bp_bh);
        path[level].bp_bh = NULL;
 }
@@ -1300,12 +1392,10 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
                path[level].bp_oldreq.bpr_ptr =
                        nilfs_btree_node_get_ptr(btree, node,
                                                 path[level].bp_index);
-                if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
+                ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
-                        ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
+                                                 &path[level].bp_oldreq);
-                                &btree->bt_bmap, &path[level].bp_oldreq);
+                if (ret < 0)
-                        if (ret < 0)
+                        goto err_out_child_node;
-                                goto err_out_child_node;
-                }
                if (nilfs_btree_node_get_nchildren(btree, node) >
                    nilfs_btree_node_nchildren_min(btree, node)) {
@@ -1321,8 +1411,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
                        /* left sibling */
                        sibptr = nilfs_btree_node_get_ptr(btree, parent,
                                                          pindex - 1);
-                        ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
+                        ret = nilfs_btree_get_block(btree, sibptr, &bh);
-                                                   &bh);
                        if (ret < 0)
                                goto err_out_curr_node;
                        sib = (struct nilfs_btree_node *)bh->b_data;
@@ -1343,8 +1432,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
                        /* right sibling */
                        sibptr = nilfs_btree_node_get_ptr(btree, parent,
                                                          pindex + 1);
-                        ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
+                        ret = nilfs_btree_get_block(btree, sibptr, &bh);
-                                                   &bh);
                        if (ret < 0)
                                goto err_out_curr_node;
                        sib = (struct nilfs_btree_node *)bh->b_data;
@@ -1381,12 +1469,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
        node = nilfs_btree_get_root(btree);
        path[level].bp_oldreq.bpr_ptr =
                nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
-        if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
-                ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
+        ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
-                        &btree->bt_bmap, &path[level].bp_oldreq);
+                                         &path[level].bp_oldreq);
-                if (ret < 0)
+        if (ret < 0)
-                        goto err_out_child_node;
+                goto err_out_child_node;
-        }
        /* child of the root node is deleted */
        path[level].bp_op = nilfs_btree_do_delete;
        stats->bs_nblocks++;
@@ -1398,15 +1486,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
        /* error */
 err_out_curr_node:
-        if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
+        nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq);
-                btree->bt_bmap.b_pops->bpop_abort_end_ptr(
-                        &btree->bt_bmap, &path[level].bp_oldreq);
 err_out_child_node:
        for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
-                nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+                brelse(path[level].bp_sib_bh);
-                if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
+                nilfs_bmap_abort_end_ptr(&btree->bt_bmap,
-                        btree->bt_bmap.b_pops->bpop_abort_end_ptr(
+                                         &path[level].bp_oldreq);
-                                &btree->bt_bmap, &path[level].bp_oldreq);
        }
        *levelp = level;
        stats->bs_nblocks = 0;
@@ -1420,9 +1505,8 @@ static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
        int level;
        for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
-                if (btree->bt_bmap.b_pops->bpop_commit_end_ptr != NULL)
+                nilfs_bmap_commit_end_ptr(&btree->bt_bmap,
-                        btree->bt_bmap.b_pops->bpop_commit_end_ptr(
+                                          &path[level].bp_oldreq);
-                                &btree->bt_bmap, &path[level].bp_oldreq);
                path[level].bp_op(btree, path, level, NULL, NULL);
        }
@@ -1501,7 +1585,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
                if (nchildren > 1)
                        return 0;
                ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
-                ret = nilfs_bmap_get_block(bmap, ptr, &bh);
+                ret = nilfs_btree_get_block(btree, ptr, &bh);
                if (ret < 0)
                        return ret;
                node = (struct nilfs_btree_node *)bh->b_data;
@@ -1515,9 +1599,9 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
        nextmaxkey = (nchildren > 1) ?
                nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0;
        if (bh != NULL)
-                nilfs_bmap_put_block(bmap, bh);
+                brelse(bh);
-        return (maxkey == key) && (nextmaxkey < bmap->b_low);
+        return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW);
 }
 static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
@@ -1542,7 +1626,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
                nchildren = nilfs_btree_node_get_nchildren(btree, root);
                WARN_ON(nchildren > 1);
                ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
-                ret = nilfs_bmap_get_block(bmap, ptr, &bh);
+                ret = nilfs_btree_get_block(btree, ptr, &bh);
                if (ret < 0)
                        return ret;
                node = (struct nilfs_btree_node *)bh->b_data;
@@ -1563,7 +1647,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
        }
        if (bh != NULL)
-                nilfs_bmap_put_block(bmap, bh);
+                brelse(bh);
        return nitems;
 }
@@ -1584,10 +1668,10 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
        /* for data */
        /* cannot find near ptr */
-        if (btree->bt_ops->btop_find_target != NULL)
+        if (NILFS_BMAP_USE_VBN(bmap))
-                dreq->bpr_ptr
+                dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
-                        = btree->bt_ops->btop_find_target(btree, NULL, key);
-        ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, dreq);
+        ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq);
        if (ret < 0)
                return ret;
@@ -1595,11 +1679,11 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
        stats->bs_nblocks++;
        if (nreq != NULL) {
                nreq->bpr_ptr = dreq->bpr_ptr + 1;
-                ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, nreq);
+                ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq);
                if (ret < 0)
                        goto err_out_dreq;
-                ret = nilfs_bmap_get_new_block(bmap, nreq->bpr_ptr, &bh);
+                ret = nilfs_btree_get_new_block(btree, nreq->bpr_ptr, &bh);
                if (ret < 0)
                        goto err_out_nreq;
@@ -1612,9 +1696,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
        /* error */
 err_out_nreq:
-        bmap->b_pops->bpop_abort_alloc_ptr(bmap, nreq);
+        nilfs_bmap_abort_alloc_ptr(bmap, nreq);
 err_out_dreq:
-        bmap->b_pops->bpop_abort_alloc_ptr(bmap, dreq);
+        nilfs_bmap_abort_alloc_ptr(bmap, dreq);
        stats->bs_nblocks = 0;
        return ret;
@@ -1624,7 +1708,7 @@ static void
 nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
                                      __u64 key, __u64 ptr,
                                      const __u64 *keys, const __u64 *ptrs,
-                                      int n, __u64 low, __u64 high,
+                                      int n,
                                      union nilfs_bmap_ptr_req *dreq,
                                      union nilfs_bmap_ptr_req *nreq,
                                      struct buffer_head *bh)
@@ -1642,12 +1726,10 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
        /* convert and insert */
        btree = (struct nilfs_btree *)bmap;
-        nilfs_btree_init(bmap, low, high);
+        nilfs_btree_init(bmap);
        if (nreq != NULL) {
-                if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) {
+                nilfs_bmap_commit_alloc_ptr(bmap, dreq);
-                        bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
+                nilfs_bmap_commit_alloc_ptr(bmap, nreq);
-                        bmap->b_pops->bpop_commit_alloc_ptr(bmap, nreq);
-                }
                /* create child node at level 1 */
                lock_buffer(bh);
@@ -1661,7 +1743,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
                        nilfs_bmap_set_dirty(bmap);
                unlock_buffer(bh);
-                nilfs_bmap_put_block(bmap, bh);
+                brelse(bh);
                /* create root node at level 2 */
                node = nilfs_btree_get_root(btree);
@@ -1669,8 +1751,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
                nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
                                      2, 1, &keys[0], &tmpptr);
        } else {
-                if (bmap->b_pops->bpop_commit_alloc_ptr != NULL)
+                nilfs_bmap_commit_alloc_ptr(bmap, dreq);
-                        bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
                /* create root node at level 1 */
                node = nilfs_btree_get_root(btree);
@@ -1682,8 +1763,8 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
                        nilfs_bmap_set_dirty(bmap);
        }
-        if (btree->bt_ops->btop_set_target != NULL)
+        if (NILFS_BMAP_USE_VBN(bmap))
-                btree->bt_ops->btop_set_target(btree, key, dreq->bpr_ptr);
+                nilfs_btree_set_target_v(btree, key, dreq->bpr_ptr);
 }
 /**
@@ -1694,13 +1775,10 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 * @keys:
 * @ptrs:
 * @n:
- * @low:
- * @high:
 */
 int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
                                   __u64 key, __u64 ptr,
-                                   const __u64 *keys, const __u64 *ptrs,
+                                   const __u64 *keys, const __u64 *ptrs, int n)
-                                   int n, __u64 low, __u64 high)
 {
        struct buffer_head *bh;
        union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
@@ -1725,7 +1803,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
        if (ret < 0)
                return ret;
        nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n,
-                                              low, high, di, ni, bh);
+                                              di, ni, bh);
        nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
        return 0;
 }
@@ -1754,9 +1832,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
                nilfs_btree_node_get_ptr(btree, parent,
                                         path[level + 1].bp_index);
        path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
-        ret = nilfs_bmap_prepare_update(&btree->bt_bmap,
+        ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap,
-                                        &path[level].bp_oldreq,
+                                          &path[level].bp_oldreq,
-                                        &path[level].bp_newreq);
+                                          &path[level].bp_newreq);
        if (ret < 0)
                return ret;
@@ -1768,9 +1846,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
                        &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
                        &path[level].bp_ctxt);
                if (ret < 0) {
-                        nilfs_bmap_abort_update(&btree->bt_bmap,
+                        nilfs_bmap_abort_update_v(&btree->bt_bmap,
-                                                &path[level].bp_oldreq,
+                                                  &path[level].bp_oldreq,
-                                                &path[level].bp_newreq);
+                                                  &path[level].bp_newreq);
                        return ret;
                }
        }
@@ -1784,9 +1862,9 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
 {
        struct nilfs_btree_node *parent;
-        nilfs_bmap_commit_update(&btree->bt_bmap,
+        nilfs_bmap_commit_update_v(&btree->bt_bmap,
-                                 &path[level].bp_oldreq,
+                                   &path[level].bp_oldreq,
-                                 &path[level].bp_newreq);
+                                   &path[level].bp_newreq);
        if (buffer_nilfs_node(path[level].bp_bh)) {
                nilfs_btnode_commit_change_key(
@@ -1805,9 +1883,9 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
                                       struct nilfs_btree_path *path,
                                       int level)
 {
-        nilfs_bmap_abort_update(&btree->bt_bmap,
+        nilfs_bmap_abort_update_v(&btree->bt_bmap,
-                                &path[level].bp_oldreq,
+                                  &path[level].bp_oldreq,
-                                &path[level].bp_newreq);
+                                  &path[level].bp_newreq);
        if (buffer_nilfs_node(path[level].bp_bh))
                nilfs_btnode_abort_change_key(
                        &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
@@ -1930,7 +2008,9 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
                goto out;
        }
-        ret = btree->bt_ops->btop_propagate(btree, path, level, bh);
+        ret = NILFS_BMAP_USE_VBN(bmap) ?
+                nilfs_btree_propagate_v(btree, path, level, bh) :
+                nilfs_btree_propagate_p(btree, path, level, bh);
 out:
        nilfs_btree_clear_path(btree, path);
@@ -2066,12 +2146,9 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
        ptr = nilfs_btree_node_get_ptr(btree, parent,
                                       path[level + 1].bp_index);
        req.bpr_ptr = ptr;
-        ret = btree->bt_bmap.b_pops->bpop_prepare_start_ptr(&btree->bt_bmap,
+        ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr);
-                                                               &req);
+        if (unlikely(ret < 0))
-        if (ret < 0)
                return ret;
-        btree->bt_bmap.b_pops->bpop_commit_start_ptr(&btree->bt_bmap,
-                                                        &req, blocknr);
        key = nilfs_btree_node_get_key(btree, parent,
                                       path[level + 1].bp_index);
@@ -2114,8 +2191,9 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
                goto out;
        }
-        ret = btree->bt_ops->btop_assign(btree, path, level, bh,
+        ret = NILFS_BMAP_USE_VBN(bmap) ?
-                                            blocknr, binfo);
+                nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) :
+                nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
 out:
        nilfs_btree_clear_path(btree, path);
@@ -2171,7 +2249,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
                WARN_ON(ret == -ENOENT);
                goto out;
        }
-        ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, &bh);
+        ret = nilfs_btree_get_block(btree, ptr, &bh);
        if (ret < 0) {
                WARN_ON(ret == -ENOENT);
                goto out;
@@ -2179,7 +2257,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
        if (!buffer_dirty(bh))
                nilfs_btnode_mark_dirty(bh);
-        nilfs_bmap_put_block(&btree->bt_bmap, bh);
+        brelse(bh);
        if (!nilfs_bmap_dirty(&btree->bt_bmap))
                nilfs_bmap_set_dirty(&btree->bt_bmap);
@@ -2191,6 +2269,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 static const struct nilfs_bmap_operations nilfs_btree_ops = {
        .bop_lookup             =       nilfs_btree_lookup,
+        .bop_lookup_contig      =       nilfs_btree_lookup_contig,
        .bop_insert             =       nilfs_btree_insert,
        .bop_delete             =       nilfs_btree_delete,
        .bop_clear              =       NULL,
@@ -2210,6 +2289,7 @@ static const struct nilfs_bmap_operations nilfs_btree_ops = {
 static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
        .bop_lookup             =       NULL,
+        .bop_lookup_contig      =       NULL,
        .bop_insert             =       NULL,
        .bop_delete             =       NULL,
        .bop_clear              =       NULL,
@@ -2227,43 +2307,13 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
        .bop_gather_data        =       NULL,
 };
-static const struct nilfs_btree_operations nilfs_btree_ops_v = {
+int nilfs_btree_init(struct nilfs_bmap *bmap)
-        .btop_find_target       =       nilfs_btree_find_target_v,
-        .btop_set_target        =       nilfs_btree_set_target_v,
-        .btop_propagate         =       nilfs_btree_propagate_v,
-        .btop_assign            =       nilfs_btree_assign_v,
-};
-static const struct nilfs_btree_operations nilfs_btree_ops_p = {
-        .btop_find_target       =       NULL,
-        .btop_set_target        =       NULL,
-        .btop_propagate         =       nilfs_btree_propagate_p,
-        .btop_assign            =       nilfs_btree_assign_p,
-};
-int nilfs_btree_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
 {
-        struct nilfs_btree *btree;
-        btree = (struct nilfs_btree *)bmap;
        bmap->b_ops = &nilfs_btree_ops;
-        bmap->b_low = low;
-        bmap->b_high = high;
-        switch (bmap->b_inode->i_ino) {
-        case NILFS_DAT_INO:
-                btree->bt_ops = &nilfs_btree_ops_p;
-                break;
-        default:
-                btree->bt_ops = &nilfs_btree_ops_v;
-                break;
-        }
        return 0;
 }
 void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
 {
-        bmap->b_low = NILFS_BMAP_LARGE_LOW;
-        bmap->b_high = NILFS_BMAP_LARGE_HIGH;
        bmap->b_ops = &nilfs_btree_ops_gc;
 }
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 4766deb52fb1..0e72bbbc6b64 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -34,28 +34,6 @@ struct nilfs_btree;
 struct nilfs_btree_path;
 /**
- * struct nilfs_btree_operations - B-tree operation table
- */
-struct nilfs_btree_operations {
-        __u64 (*btop_find_target)(const struct nilfs_btree *,
-                                  const struct nilfs_btree_path *, __u64);
-        void (*btop_set_target)(struct nilfs_btree *, __u64, __u64);
-        struct the_nilfs *(*btop_get_nilfs)(struct nilfs_btree *);
-        int (*btop_propagate)(struct nilfs_btree *,
-                              struct nilfs_btree_path *,
-                              int,
-                              struct buffer_head *);
-        int (*btop_assign)(struct nilfs_btree *,
-                           struct nilfs_btree_path *,
-                           int,
-                           struct buffer_head **,
-                           sector_t,
-                           union nilfs_binfo *);
-};
-/**
 * struct nilfs_btree_node - B-tree node
 * @bn_flags: flags
 * @bn_level: level
@@ -80,13 +58,9 @@ struct nilfs_btree_node {
 /**
 * struct nilfs_btree - B-tree structure
 * @bt_bmap: bmap base structure
- * @bt_ops: B-tree operation table
 */
 struct nilfs_btree {
        struct nilfs_bmap bt_bmap;
-        /* B-tree-specific members */
-        const struct nilfs_btree_operations *bt_ops;
 };
@@ -108,10 +82,9 @@ struct nilfs_btree {
 int nilfs_btree_path_cache_init(void);
 void nilfs_btree_path_cache_destroy(void);
-int nilfs_btree_init(struct nilfs_bmap *, __u64, __u64);
+int nilfs_btree_init(struct nilfs_bmap *);
 int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
-                                   const __u64 *, const __u64 *,
+                                   const __u64 *, const __u64 *, int);
-                                   int, __u64, __u64);
 void nilfs_btree_init_gc(struct nilfs_bmap *);
 #endif  /* _NILFS_BTREE_H */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index cadd36b14d07..aec942cf79e3 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -295,10 +295,6 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
                return -EINVAL;
        }
-        /* cannot delete the latest checkpoint */
-        if (start == nilfs_mdt_cno(cpfile) - 1)
-                return -EPERM;
        down_write(&NILFS_MDT(cpfile)->mi_sem);
        ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
@@ -311,7 +307,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
                ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
                if (ret < 0) {
                        if (ret != -ENOENT)
-                                goto out_header;
+                                break;
                        /* skip hole */
                        ret = 0;
                        continue;
@@ -344,7 +340,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
                                        continue;
                                printk(KERN_ERR "%s: cannot delete block\n",
                                       __func__);
-                                goto out_header;
+                                break;
                        }
                }
@@ -362,7 +358,6 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
                kunmap_atomic(kaddr, KM_USER0);
        }
- out_header:
        brelse(header_bh);
 out_sem:
@@ -384,9 +379,10 @@ static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile,
 }
 static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
-                                          struct nilfs_cpinfo *ci, size_t nci)
+                                          void *buf, unsigned cisz, size_t nci)
 {
        struct nilfs_checkpoint *cp;
+        struct nilfs_cpinfo *ci = buf;
        struct buffer_head *bh;
        size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
        __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop;
@@ -410,17 +406,22 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
                kaddr = kmap_atomic(bh->b_page, KM_USER0);
                cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
                for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
-                        if (!nilfs_checkpoint_invalid(cp))
+                        if (!nilfs_checkpoint_invalid(cp)) {
-                                nilfs_cpfile_checkpoint_to_cpinfo(
+                                nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp,
-                                        cpfile, cp, &ci[n++]);
+                                                                  ci);
+                                ci = (void *)ci + cisz;
+                                n++;
+                        }
                }
                kunmap_atomic(kaddr, KM_USER0);
                brelse(bh);
        }
        ret = n;
-        if (n > 0)
+        if (n > 0) {
-                *cnop = ci[n - 1].ci_cno + 1;
+                ci = (void *)ci - cisz;
+                *cnop = ci->ci_cno + 1;
+        }
 out:
        up_read(&NILFS_MDT(cpfile)->mi_sem);
@@ -428,11 +429,12 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
 }
 static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
-                                          struct nilfs_cpinfo *ci, size_t nci)
+                                          void *buf, unsigned cisz, size_t nci)
 {
        struct buffer_head *bh;
        struct nilfs_cpfile_header *header;
        struct nilfs_checkpoint *cp;
+        struct nilfs_cpinfo *ci = buf;
        __u64 curr = *cnop, next;
        unsigned long curr_blkoff, next_blkoff;
        void *kaddr;
@@ -472,7 +474,9 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
                if (unlikely(nilfs_checkpoint_invalid(cp) ||
                             !nilfs_checkpoint_snapshot(cp)))
                        break;
-                nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, &ci[n++]);
+                nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, ci);
+                ci = (void *)ci + cisz;
+                n++;
                next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
                if (next == 0)
                        break; /* reach end of the snapshot list */
@@ -511,13 +515,13 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
 */
 ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
-                                struct nilfs_cpinfo *ci, size_t nci)
+                                void *buf, unsigned cisz, size_t nci)
 {
        switch (mode) {
        case NILFS_CHECKPOINT:
-                return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, ci, nci);
+                return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, buf, cisz, nci);
        case NILFS_SNAPSHOT:
-                return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, ci, nci);
+                return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, buf, cisz, nci);
        default:
                return -EINVAL;
        }
@@ -533,20 +537,14 @@ int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno)
        struct nilfs_cpinfo ci;
        __u64 tcno = cno;
        ssize_t nci;
-        int ret;
-        nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, 1);
+        nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, sizeof(ci), 1);
        if (nci < 0)
                return nci;
        else if (nci == 0 || ci.ci_cno != cno)
                return -ENOENT;
+        else if (nilfs_cpinfo_snapshot(&ci))
-        /* cannot delete the latest checkpoint nor snapshots */
+                return -EBUSY;
-        ret = nilfs_cpinfo_snapshot(&ci);
-        if (ret < 0)
-                return ret;
-        else if (ret > 0 || cno == nilfs_mdt_cno(cpfile) - 1)
-                return -EPERM;
        return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1);
 }
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 1a8a1008c342..788a45950197 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -39,7 +39,7 @@ int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
 int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
 int nilfs_cpfile_is_snapshot(struct inode *, __u64);
 int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
-ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int,
+ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
-                                struct nilfs_cpinfo *, size_t);
+                                size_t);
 #endif  /* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index bb8a5818e7f1..8927ca27e6f7 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -92,21 +92,6 @@ void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
        nilfs_palloc_abort_alloc_entry(dat, req);
 }
-int nilfs_dat_prepare_free(struct inode *dat, struct nilfs_palloc_req *req)
-{
-        int ret;
-        ret = nilfs_palloc_prepare_free_entry(dat, req);
-        if (ret < 0)
-                return ret;
-        ret = nilfs_dat_prepare_entry(dat, req, 0);
-        if (ret < 0) {
-                nilfs_palloc_abort_free_entry(dat, req);
-                return ret;
-        }
-        return 0;
-}
 void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
 {
        struct nilfs_dat_entry *entry;
@@ -149,15 +134,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
        entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
                                             req->pr_entry_bh, kaddr);
        entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat));
-        if (entry->de_blocknr != cpu_to_le64(0) ||
-            entry->de_end != cpu_to_le64(NILFS_CNO_MAX)) {
-                printk(KERN_CRIT
-                       "%s: vbn = %llu, start = %llu, end = %llu, pbn = %llu\n",
-                       __func__, (unsigned long long)req->pr_entry_nr,
-                       (unsigned long long)le64_to_cpu(entry->de_start),
-                       (unsigned long long)le64_to_cpu(entry->de_end),
-                       (unsigned long long)le64_to_cpu(entry->de_blocknr));
-        }
        entry->de_blocknr = cpu_to_le64(blocknr);
        kunmap_atomic(kaddr, KM_USER0);
@@ -391,36 +367,37 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
        return ret;
 }
-ssize_t nilfs_dat_get_vinfo(struct inode *dat, struct nilfs_vinfo *vinfo,
+ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
                            size_t nvi)
 {
        struct buffer_head *entry_bh;
        struct nilfs_dat_entry *entry;
+        struct nilfs_vinfo *vinfo = buf;
        __u64 first, last;
        void *kaddr;
        unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block;
        int i, j, n, ret;
        for (i = 0; i < nvi; i += n) {
-                ret = nilfs_palloc_get_entry_block(dat, vinfo[i].vi_vblocknr,
+                ret = nilfs_palloc_get_entry_block(dat, vinfo->vi_vblocknr,
                                                   0, &entry_bh);
                if (ret < 0)
                        return ret;
                kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
                /* last virtual block number in this block */
-                first = vinfo[i].vi_vblocknr;
+                first = vinfo->vi_vblocknr;
                do_div(first, entries_per_block);
                first *= entries_per_block;
                last = first + entries_per_block - 1;
                for (j = i, n = 0;
-                     j < nvi && vinfo[j].vi_vblocknr >= first &&
+                     j < nvi && vinfo->vi_vblocknr >= first &&
-                             vinfo[j].vi_vblocknr <= last;
+                             vinfo->vi_vblocknr <= last;
-                     j++, n++) {
+                     j++, n++, vinfo = (void *)vinfo + visz) {
                        entry = nilfs_palloc_block_get_entry(
-                                dat, vinfo[j].vi_vblocknr, entry_bh, kaddr);
+                                dat, vinfo->vi_vblocknr, entry_bh, kaddr);
-                        vinfo[j].vi_start = le64_to_cpu(entry->de_start);
+                        vinfo->vi_start = le64_to_cpu(entry->de_start);
-                        vinfo[j].vi_end = le64_to_cpu(entry->de_end);
+                        vinfo->vi_end = le64_to_cpu(entry->de_end);
-                        vinfo[j].vi_blocknr = le64_to_cpu(entry->de_blocknr);
+                        vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr);
                }
                kunmap_atomic(kaddr, KM_USER0);
                brelse(entry_bh);
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d9560654a4b7..d328b81eead4 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -47,6 +47,6 @@ void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
 int nilfs_dat_mark_dirty(struct inode *, __u64);
 int nilfs_dat_freev(struct inode *, __u64 *, size_t);
 int nilfs_dat_move(struct inode *, __u64, sector_t);
-ssize_t nilfs_dat_get_vinfo(struct inode *, struct nilfs_vinfo *, size_t);
+ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
 #endif  /* _NILFS_DAT_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 54100acc1102..1a4fa04cf071 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -43,7 +43,6 @@
 */
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include "nilfs.h"
 #include "page.h"
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index c6379e482781..342d9765df8d 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -25,6 +25,7 @@
 #include "page.h"
 #include "direct.h"
 #include "alloc.h"
+#include "dat.h"
 static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct)
 {
@@ -62,6 +63,47 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
        return 0;
 }
+static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
+                                      __u64 key, __u64 *ptrp,
+                                      unsigned maxblocks)
+{       /* look up @key and count how many following keys map to consecutive blocks */
+        struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
+        struct inode *dat = NULL;       /* stays NULL unless NILFS_BMAP_USE_VBN(bmap) */
+        __u64 ptr, ptr2;
+        sector_t blocknr;
+        int ret, cnt;
+        if (key > NILFS_DIRECT_KEY_MAX ||
+            (ptr = nilfs_direct_get_ptr(direct, key)) ==
+            NILFS_BMAP_INVALID_PTR)
+                return -ENOENT; /* key out of range, or no pointer stored for it */
+        if (NILFS_BMAP_USE_VBN(bmap)) {
+                dat = nilfs_bmap_get_dat(bmap);
+                ret = nilfs_dat_translate(dat, ptr, &blocknr);
+                if (ret < 0)
+                        return ret;
+                ptr = blocknr;  /* continue with the translated block number */
+        }
+        maxblocks = min_t(unsigned, maxblocks, NILFS_DIRECT_KEY_MAX - key + 1); /* clamp to keys left */
+        for (cnt = 1; cnt < maxblocks &&
+                     (ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) !=
+                     NILFS_BMAP_INVALID_PTR;
+             cnt++) {
+                if (dat) {      /* same translation as applied to the first pointer */
+                        ret = nilfs_dat_translate(dat, ptr2, &blocknr);
+                        if (ret < 0)
+                                return ret;
+                        ptr2 = blocknr;
+                }
+                if (ptr2 != ptr + cnt)
+                        break;  /* run ends at the first non-adjacent block */
+        }
+        *ptrp = ptr;    /* first block of the run */
+        return cnt;     /* run length on success; always at least 1 */
+}
 static __u64
 nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key)
 {
@@ -90,10 +132,9 @@ static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
 {
        int ret;
-        if (direct->d_ops->dop_find_target != NULL)
+        if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
-                req->bpr_ptr = direct->d_ops->dop_find_target(direct, key);
+                req->bpr_ptr = nilfs_direct_find_target_v(direct, key);
-        ret = direct->d_bmap.b_pops->bpop_prepare_alloc_ptr(&direct->d_bmap,
+        ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req);
-                                                               req);
        if (ret < 0)
                return ret;
@@ -111,16 +152,14 @@ static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
        bh = (struct buffer_head *)((unsigned long)ptr);
        set_buffer_nilfs_volatile(bh);
-        if (direct->d_bmap.b_pops->bpop_commit_alloc_ptr != NULL)
+        nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req);
-                direct->d_bmap.b_pops->bpop_commit_alloc_ptr(
-                        &direct->d_bmap, req);
        nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
        if (!nilfs_bmap_dirty(&direct->d_bmap))
                nilfs_bmap_set_dirty(&direct->d_bmap);
-        if (direct->d_ops->dop_set_target != NULL)
+        if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
-                direct->d_ops->dop_set_target(direct, key, req->bpr_ptr);
+                nilfs_direct_set_target_v(direct, key, req->bpr_ptr);
 }
 static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
@@ -152,25 +191,18 @@ static int nilfs_direct_prepare_delete(struct nilfs_direct *direct,
 {
        int ret;
-        if (direct->d_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
+        req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
-                req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
+        ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req);
-                ret = direct->d_bmap.b_pops->bpop_prepare_end_ptr(
+        if (!ret)
-                        &direct->d_bmap, req);
+                stats->bs_nblocks = 1;
-                if (ret < 0)
+        return ret;
-                        return ret;
-        }
-        stats->bs_nblocks = 1;
-        return 0;
 }
 static void nilfs_direct_commit_delete(struct nilfs_direct *direct,
                                       union nilfs_bmap_ptr_req *req,
                                       __u64 key)
 {
-        if (direct->d_bmap.b_pops->bpop_commit_end_ptr != NULL)
+        nilfs_bmap_commit_end_ptr(&direct->d_bmap, req);
-                direct->d_bmap.b_pops->bpop_commit_end_ptr(
-                        &direct->d_bmap, req);
        nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
 }
@@ -244,8 +276,7 @@ static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
 }
 int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
-                                    __u64 key, __u64 *keys, __u64 *ptrs,
+                                    __u64 key, __u64 *keys, __u64 *ptrs, int n)
-                                    int n, __u64 low, __u64 high)
 {
        struct nilfs_direct *direct;
        __le64 *dptrs;
@@ -275,8 +306,7 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
                        dptrs[i] = NILFS_BMAP_INVALID_PTR;
        }
-        nilfs_direct_init(bmap, low, high);
+        nilfs_direct_init(bmap);
        return 0;
 }
@@ -293,11 +323,11 @@ static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
        if (!buffer_nilfs_volatile(bh)) {
                oldreq.bpr_ptr = ptr;
                newreq.bpr_ptr = ptr;
-                ret = nilfs_bmap_prepare_update(&direct->d_bmap, &oldreq,
+                ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq,
-                                                &newreq);
+                                                  &newreq);
                if (ret < 0)
                        return ret;
-                nilfs_bmap_commit_update(&direct->d_bmap, &oldreq, &newreq);
+                nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq);
                set_buffer_nilfs_volatile(bh);
                nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr);
        } else
@@ -309,12 +339,10 @@ static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
 static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
                                  struct buffer_head *bh)
 {
-        struct nilfs_direct *direct;
+        struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
-        direct = (struct nilfs_direct *)bmap;
+        return NILFS_BMAP_USE_VBN(bmap) ?
-        return (direct->d_ops->dop_propagate != NULL) ?
+                nilfs_direct_propagate_v(direct, bh) : 0;
-                direct->d_ops->dop_propagate(direct, bh) :
-                0;
 }
 static int nilfs_direct_assign_v(struct nilfs_direct *direct,
@@ -327,12 +355,9 @@ static int nilfs_direct_assign_v(struct nilfs_direct *direct,
        int ret;
        req.bpr_ptr = ptr;
-        ret = direct->d_bmap.b_pops->bpop_prepare_start_ptr(
+        ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr);
-                &direct->d_bmap, &req);
+        if (unlikely(ret < 0))
-        if (ret < 0)
                return ret;
-        direct->d_bmap.b_pops->bpop_commit_start_ptr(&direct->d_bmap,
-                                                     &req, blocknr);
        binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
        binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
@@ -377,12 +402,14 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
                return -EINVAL;
        }
-        return direct->d_ops->dop_assign(direct, key, ptr, bh,
+        return NILFS_BMAP_USE_VBN(bmap) ?
-                                         blocknr, binfo);
+                nilfs_direct_assign_v(direct, key, ptr, bh, blocknr, binfo) :
+                nilfs_direct_assign_p(direct, key, ptr, bh, blocknr, binfo);
 }
 static const struct nilfs_bmap_operations nilfs_direct_ops = {
        .bop_lookup             =       nilfs_direct_lookup,
+        .bop_lookup_contig      =       nilfs_direct_lookup_contig,
        .bop_insert             =       nilfs_direct_insert,
        .bop_delete             =       nilfs_direct_delete,
        .bop_clear              =       NULL,
@@ -401,36 +428,8 @@ static const struct nilfs_bmap_operations nilfs_direct_ops = {
 };
-static const struct nilfs_direct_operations nilfs_direct_ops_v = {
+int nilfs_direct_init(struct nilfs_bmap *bmap)
-        .dop_find_target        =       nilfs_direct_find_target_v,
-        .dop_set_target         =       nilfs_direct_set_target_v,
-        .dop_propagate          =       nilfs_direct_propagate_v,
-        .dop_assign             =       nilfs_direct_assign_v,
-};
-static const struct nilfs_direct_operations nilfs_direct_ops_p = {
-        .dop_find_target        =       NULL,
-        .dop_set_target         =       NULL,
-        .dop_propagate          =       NULL,
-        .dop_assign             =       nilfs_direct_assign_p,
-};
-int nilfs_direct_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
 {
-        struct nilfs_direct *direct;
-        direct = (struct nilfs_direct *)bmap;
        bmap->b_ops = &nilfs_direct_ops;
-        bmap->b_low = low;
-        bmap->b_high = high;
-        switch (bmap->b_inode->i_ino) {
-        case NILFS_DAT_INO:
-                direct->d_ops = &nilfs_direct_ops_p;
-                break;
-        default:
-                direct->d_ops = &nilfs_direct_ops_v;
-                break;
-        }
        return 0;
 }
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index 45d2c5cda812..a5ffd66e25d0 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -31,18 +31,6 @@
 struct nilfs_direct;
 /**
- * struct nilfs_direct_operations - direct mapping operation table
- */
-struct nilfs_direct_operations {
-        __u64 (*dop_find_target)(const struct nilfs_direct *, __u64);
-        void (*dop_set_target)(struct nilfs_direct *, __u64, __u64);
-        int (*dop_propagate)(struct nilfs_direct *, struct buffer_head *);
-        int (*dop_assign)(struct nilfs_direct *, __u64, __u64,
-                          struct buffer_head **, sector_t,
-                          union nilfs_binfo *);
-};
-/**
 * struct nilfs_direct_node - direct node
 * @dn_flags: flags
 * @dn_pad: padding
@@ -55,13 +43,9 @@ struct nilfs_direct_node {
 /**
 * struct nilfs_direct - direct mapping
 * @d_bmap: bmap structure
- * @d_ops: direct mapping operation table
 */
 struct nilfs_direct {
        struct nilfs_bmap d_bmap;
-        /* direct-mapping-specific members */
-        const struct nilfs_direct_operations *d_ops;
 };
@@ -70,9 +54,9 @@ struct nilfs_direct {
 #define NILFS_DIRECT_KEY_MAX    (NILFS_DIRECT_NBLOCKS - 1)
-int nilfs_direct_init(struct nilfs_bmap *, __u64, __u64);
+int nilfs_direct_init(struct nilfs_bmap *);
 int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *,
-                                    __u64 *, int, __u64, __u64);
+                                    __u64 *, int);
 #endif  /* _NILFS_DIRECT_H */
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 19d2102b6a69..1b3c2bb20da9 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -52,8 +52,9 @@
 #include "dat.h"
 #include "ifile.h"
-static struct address_space_operations def_gcinode_aops = {};
+static struct address_space_operations def_gcinode_aops = {
-/* XXX need def_gcinode_iops/fops? */
+        .sync_page              = block_sync_page,
+};
 /*
 * nilfs_gccache_submit_read_data() - add data buffer and submit read request
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 49ab4a49bb4f..fe9d8f2a13f8 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -43,22 +43,23 @@
 *
 * This function does not issue actual read request of the specified data
 * block. It is done by VFS.
- * Bulk read for direct-io is not supported yet. (should be supported)
 */
 int nilfs_get_block(struct inode *inode, sector_t blkoff,
                    struct buffer_head *bh_result, int create)
 {
        struct nilfs_inode_info *ii = NILFS_I(inode);
-        unsigned long blknum = 0;
+        __u64 blknum = 0;
        int err = 0, ret;
        struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode));
+        unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
-        /* This exclusion control is a workaround; should be revised */
+        down_read(&NILFS_MDT(dat)->mi_sem);
-        down_read(&NILFS_MDT(dat)->mi_sem);     /* XXX */
+        ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
-        ret = nilfs_bmap_lookup(ii->i_bmap, (unsigned long)blkoff, &blknum);
+        up_read(&NILFS_MDT(dat)->mi_sem);
-        up_read(&NILFS_MDT(dat)->mi_sem);       /* XXX */
+        if (ret >= 0) { /* found */
-        if (ret == 0) { /* found */
                map_bh(bh_result, inode->i_sb, blknum);
+                if (ret > 0)
+                        bh_result->b_size = (ret << inode->i_blkbits);
                goto out;
        }
        /* data block was not found */
@@ -240,7 +241,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 struct address_space_operations nilfs_aops = {
        .writepage              = nilfs_writepage,
        .readpage               = nilfs_readpage,
-        /* .sync_page           = nilfs_sync_page, */
+        .sync_page              = block_sync_page,
        .writepages             = nilfs_writepages,
        .set_page_dirty         = nilfs_set_page_dirty,
        .readpages              = nilfs_readpages,
@@ -249,6 +250,7 @@ struct address_space_operations nilfs_aops = {
        /* .releasepage         = nilfs_releasepage, */
        .invalidatepage         = block_invalidatepage,
        .direct_IO              = nilfs_direct_IO,
+        .is_partially_uptodate  = block_is_partially_uptodate,
 };
 struct inode *nilfs_new_inode(struct inode *dir, int mode)
@@ -307,10 +309,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
        /* ii->i_file_acl = 0; */
        /* ii->i_dir_acl = 0; */
        ii->i_dir_start_lookup = 0;
-#ifdef CONFIG_NILFS_FS_POSIX_ACL
-        ii->i_acl = NULL;
-        ii->i_default_acl = NULL;
-#endif
        ii->i_cno = 0;
        nilfs_set_inode_flags(inode);
        spin_lock(&sbi->s_next_gen_lock);
@@ -432,10 +430,6 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
        raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
-#ifdef CONFIG_NILFS_FS_POSIX_ACL
-        ii->i_acl = NILFS_ACL_NOT_CACHED;
-        ii->i_default_acl = NILFS_ACL_NOT_CACHED;
-#endif
        if (nilfs_read_inode_common(inode, raw_inode))
                goto failed_unmap;
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index d6759b92006f..6ea5f872e2de 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -152,7 +152,7 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
        down_read(&nilfs->ns_segctor_sem);
        ret = nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf,
-                                      nmembs);
+                                      size, nmembs);
        up_read(&nilfs->ns_segctor_sem);
        return ret;
 }
@@ -182,7 +182,8 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
        int ret;
        down_read(&nilfs->ns_segctor_sem);
-        ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, nmembs);
+        ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, size,
+                                      nmembs);
        up_read(&nilfs->ns_segctor_sem);
        return ret;
 }
@@ -212,7 +213,7 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
        int ret;
        down_read(&nilfs->ns_segctor_sem);
-        ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, nmembs);
+        ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, size, nmembs);
        up_read(&nilfs->ns_segctor_sem);
        return ret;
 }
@@ -435,24 +436,6 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
        return nmembs;
 }
-static int nilfs_ioctl_free_segments(struct the_nilfs *nilfs,
-                                     struct nilfs_argv *argv, void *buf)
-{
-        size_t nmembs = argv->v_nmembs;
-        struct nilfs_sb_info *sbi = nilfs->ns_writer;
-        int ret;
-        if (unlikely(!sbi)) {
-                /* never happens because called for a writable mount */
-                WARN_ON(1);
-                return -EROFS;
-        }
-        ret = nilfs_segctor_add_segments_to_be_freed(
-                NILFS_SC(sbi), buf, nmembs);
-        return (ret < 0) ? ret : nmembs;
-}
 int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
                                       struct nilfs_argv *argv, void **kbufs)
 {
@@ -491,14 +474,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
                msg = "cannot mark copying blocks dirty";
                goto failed;
        }
-        ret = nilfs_ioctl_free_segments(nilfs, &argv[4], kbufs[4]);
-        if (ret < 0) {
-                /*
-                 * can safely abort because this operation is atomic.
-                 */
-                msg = "cannot set segments to be freed";
-                goto failed;
-        }
        return 0;
 failed:
@@ -615,7 +590,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
        if (copy_from_user(&argv, argp, sizeof(argv)))
                return -EFAULT;
-        if (argv.v_size != membsz)
+        if (argv.v_size < membsz)
                return -EINVAL;
        ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), dofunc);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index bb78745a0e30..2dfd47714ae5 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -412,8 +412,10 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
                return 0; /* Do not request flush for shadow page cache */
        if (!sb) {
                writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs);
-                if (!writer)
+                if (!writer) {
+                        nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
                        return -EROFS;
+                }
                sb = writer->s_super;
        }
@@ -430,6 +432,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 static struct address_space_operations def_mdt_aops = {
        .writepage              = nilfs_mdt_write_page,
+        .sync_page              = block_sync_page,
 };
 static struct inode_operations def_mdt_iops;
@@ -449,7 +452,7 @@ struct inode *
 nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
                     ino_t ino, gfp_t gfp_mask)
 {
-        struct inode *inode = nilfs_alloc_inode(sb);
+        struct inode *inode = nilfs_alloc_inode_common(nilfs);
        if (!inode)
                return NULL;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index da6fc0bba2e5..724c63766e82 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -58,10 +58,6 @@ struct nilfs_inode_info {
         */
        struct rw_semaphore xattr_sem;
 #endif
-#ifdef CONFIG_NILFS_POSIX_ACL
-        struct posix_acl *i_acl;
-        struct posix_acl *i_default_acl;
-#endif
        struct buffer_head *i_bh;       /* i_bh contains a new or dirty
                                           disk inode */
        struct inode vfs_inode;
@@ -263,6 +259,7 @@ extern void nilfs_dirty_inode(struct inode *);
 extern struct dentry *nilfs_get_parent(struct dentry *);
 /* super.c */
+extern struct inode *nilfs_alloc_inode_common(struct the_nilfs *);
 extern struct inode *nilfs_alloc_inode(struct super_block *);
 extern void nilfs_destroy_inode(struct inode *);
 extern void nilfs_error(struct super_block *, const char *, const char *, ...)
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 57afa9d24061..d80cc71be749 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -28,7 +28,6 @@
 #include "segment.h"
 #include "sufile.h"
 #include "page.h"
-#include "seglist.h"
 #include "segbuf.h"
 /*
@@ -395,6 +394,24 @@ static void dispose_recovery_list(struct list_head *head)
        }
 }
+struct nilfs_segment_entry {    /* one node of a segment-number list */
+        struct list_head        list;   /* link into the list passed to nilfs_segment_list_add() */
+        __u64                   segnum; /* segment number recorded by this entry */
+};
+static int nilfs_segment_list_add(struct list_head *head, __u64 segnum)
+{       /* allocate an entry for @segnum and append it to @head; 0 or -ENOMEM */
+        struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
+        if (unlikely(!ent))
+                return -ENOMEM; /* nothing was linked; @head is unchanged */
+        ent->segnum = segnum;
+        INIT_LIST_HEAD(&ent->list);
+        list_add_tail(&ent->list, head);        /* keep insertion order */
+        return 0;       /* entry is released later by kfree(), e.g. in nilfs_dispose_segment_list() */
+}
 void nilfs_dispose_segment_list(struct list_head *head)
 {
        while (!list_empty(head)) {
@@ -402,7 +419,7 @@ void nilfs_dispose_segment_list(struct list_head *head)
                        = list_entry(head->next,
                                     struct nilfs_segment_entry, list);
                list_del(&ent->list);
-                nilfs_free_segment_entry(ent);
+                kfree(ent);
        }
 }
@@ -431,12 +448,10 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
        if (unlikely(err))
                goto failed;
-        err = -ENOMEM;
        for (i = 1; i < 4; i++) {
-                ent = nilfs_alloc_segment_entry(segnum[i]);
+                err = nilfs_segment_list_add(head, segnum[i]);
-                if (unlikely(!ent))
+                if (unlikely(err))
                        goto failed;
-                list_add_tail(&ent->list, head);
        }
        /*
@@ -450,7 +465,7 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
                                goto failed;
                }
                list_del(&ent->list);
-                nilfs_free_segment_entry(ent);
+                kfree(ent);
        }
        /* Allocate new segments for recovery */
@@ -791,7 +806,6 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
        u64 seg_seq;
        __u64 segnum, nextnum = 0;
        __u64 cno;
-        struct nilfs_segment_entry *ent;
        LIST_HEAD(segments);
        int empty_seg = 0, scan_newer = 0;
        int ret;
@@ -892,12 +906,9 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
                if (empty_seg++)
                        goto super_root_found; /* found a valid super root */
-                ent = nilfs_alloc_segment_entry(segnum);
+                ret = nilfs_segment_list_add(&segments, segnum);
-                if (unlikely(!ent)) {
+                if (unlikely(ret))
-                        ret = -ENOMEM;
                        goto failed;
-                }
-                list_add_tail(&ent->list, &segments);
                seg_seq++;
                segnum = nextnum;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 1e68821b4a9b..9e3fe17bb96b 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -26,7 +26,6 @@
 #include <linux/crc32.h>
 #include "page.h"
 #include "segbuf.h"
-#include "seglist.h"
 static struct kmem_cache *nilfs_segbuf_cachep;
@@ -394,7 +393,7 @@ int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
                 * Last BIO is always sent through the following
                 * submission.
                 */
-                rw |= (1 << BIO_RW_SYNCIO);
+                rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
                res = nilfs_submit_seg_bio(wi, rw);
                if (unlikely(res))
                        goto failed_bio;
diff --git a/fs/nilfs2/seglist.h b/fs/nilfs2/seglist.h
deleted file mode 100644
index d39df9144e99..000000000000
--- a/fs/nilfs2/seglist.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * seglist.h - expediential structure and routines to handle list of segments
- *             (would be removed in a future release)
- *
- * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
- *
- */
-#ifndef _NILFS_SEGLIST_H
-#define _NILFS_SEGLIST_H
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
-#include "sufile.h"
-struct nilfs_segment_entry {
-        __u64                   segnum;
-#define NILFS_SLH_FREED         0x0001  /* The segment was freed provisonally.
-                                           It must be cancelled if
-                                           construction aborted */
-        unsigned                flags;
-        struct list_head        list;
-        struct buffer_head     *bh_su;
-        struct nilfs_segment_usage *raw_su;
-};
-void nilfs_dispose_segment_list(struct list_head *);
-static inline struct nilfs_segment_entry *
-nilfs_alloc_segment_entry(__u64 segnum)
-{
-        struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
-        if (likely(ent)) {
-                ent->segnum = segnum;
-                ent->flags = 0;
-                ent->bh_su = NULL;
-                ent->raw_su = NULL;
-                INIT_LIST_HEAD(&ent->list);
-        }
-        return ent;
-}
-static inline int nilfs_open_segment_entry(struct nilfs_segment_entry *ent,
-                                           struct inode *sufile)
-{
-        return nilfs_sufile_get_segment_usage(sufile, ent->segnum,
-                                              &ent->raw_su, &ent->bh_su);
-}
-static inline void nilfs_close_segment_entry(struct nilfs_segment_entry *ent,
-                                             struct inode *sufile)
-{
-        if (!ent->bh_su)
-                return;
-        nilfs_sufile_put_segment_usage(sufile, ent->segnum, ent->bh_su);
-        ent->bh_su = NULL;
-        ent->raw_su = NULL;
-}
-static inline void nilfs_free_segment_entry(struct nilfs_segment_entry *ent)
-{
-        kfree(ent);
-}
-#endif /* _NILFS_SEGLIST_H */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 22c7f65c2403..51ff3d0a4ee2 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -39,7 +39,6 @@
 #include "sufile.h"
 #include "cpfile.h"
 #include "ifile.h"
-#include "seglist.h"
 #include "segbuf.h"
@@ -79,7 +78,8 @@ enum {
 /* State flags of collection */
 #define NILFS_CF_NODE           0x0001  /* Collecting node blocks */
 #define NILFS_CF_IFILE_STARTED  0x0002  /* IFILE stage has started */
-#define NILFS_CF_HISTORY_MASK   (NILFS_CF_IFILE_STARTED)
+#define NILFS_CF_SUFREED        0x0004  /* segment usages have been freed */
+#define NILFS_CF_HISTORY_MASK   (NILFS_CF_IFILE_STARTED | NILFS_CF_SUFREED)
 /* Operations depending on the construction mode and file type */
 struct nilfs_sc_operations {
@@ -810,7 +810,7 @@ static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
 {
        return list_empty(&sci->sc_dirty_files) &&
                !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) &&
-                list_empty(&sci->sc_cleaning_segments) &&
+                sci->sc_nfreesegs == 0 &&
                (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes));
 }
@@ -1005,44 +1005,6 @@ static void nilfs_drop_collected_inodes(struct list_head *head)
        }
 }
-static void nilfs_segctor_cancel_free_segments(struct nilfs_sc_info *sci,
-                                               struct inode *sufile)
-{
-        struct list_head *head = &sci->sc_cleaning_segments;
-        struct nilfs_segment_entry *ent;
-        int err;
-        list_for_each_entry(ent, head, list) {
-                if (!(ent->flags & NILFS_SLH_FREED))
-                        break;
-                err = nilfs_sufile_cancel_free(sufile, ent->segnum);
-                WARN_ON(err); /* do not happen */
-                ent->flags &= ~NILFS_SLH_FREED;
-        }
-}
-static int nilfs_segctor_prepare_free_segments(struct nilfs_sc_info *sci,
-                                               struct inode *sufile)
-{
-        struct list_head *head = &sci->sc_cleaning_segments;
-        struct nilfs_segment_entry *ent;
-        int err;
-        list_for_each_entry(ent, head, list) {
-                err = nilfs_sufile_free(sufile, ent->segnum);
-                if (unlikely(err))
-                        return err;
-                ent->flags |= NILFS_SLH_FREED;
-        }
-        return 0;
-}
-static void nilfs_segctor_commit_free_segments(struct nilfs_sc_info *sci)
-{
-        nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
-}
 static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci,
                                       struct inode *inode,
                                       struct list_head *listp,
@@ -1161,6 +1123,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
        struct the_nilfs *nilfs = sbi->s_nilfs;
        struct list_head *head;
        struct nilfs_inode_info *ii;
+        size_t ndone;
        int err = 0;
        switch (sci->sc_stage.scnt) {
@@ -1250,10 +1213,16 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
                        break;
                sci->sc_stage.scnt++;  /* Fall through */
        case NILFS_ST_SUFILE:
-                err = nilfs_segctor_prepare_free_segments(sci,
+                err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs,
-                                                          nilfs->ns_sufile);
+                                         sci->sc_nfreesegs, &ndone);
-                if (unlikely(err))
+                if (unlikely(err)) {
+                        nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+                                                  sci->sc_freesegs, ndone,
+                                                  NULL);
                        break;
+                }
+                sci->sc_stage.flags |= NILFS_CF_SUFREED;
                err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile,
                                              &nilfs_sc_file_ops);
                if (unlikely(err))
@@ -1486,7 +1455,15 @@ static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
 {
        if (unlikely(err)) {
                nilfs_segctor_free_incomplete_segments(sci, nilfs);
-                nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
+                if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
+                        int ret;
+                        ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+                                                        sci->sc_freesegs,
+                                                        sci->sc_nfreesegs,
+                                                        NULL);
+                        WARN_ON(ret); /* should not happen */
+                }
        }
        nilfs_segctor_clear_segment_buffers(sci);
 }
@@ -1585,7 +1562,13 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
                if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
                        break;
-                nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
+                if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
+                        err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+                                                        sci->sc_freesegs,
+                                                        sci->sc_nfreesegs,
+                                                        NULL);
+                        WARN_ON(err); /* should not happen */
+                }
                nilfs_segctor_clear_segment_buffers(sci);
                err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
@@ -1846,26 +1829,13 @@ static int nilfs_segctor_write(struct nilfs_sc_info *sci,
                err = nilfs_segbuf_write(segbuf, &wi);
                res = nilfs_segbuf_wait(segbuf, &wi);
-                err = unlikely(err) ? : res;
+                err = err ? : res;
-                if (unlikely(err))
+                if (err)
                        return err;
        }
        return 0;
 }
-static int nilfs_page_has_uncleared_buffer(struct page *page)
-{
-        struct buffer_head *head, *bh;
-        head = bh = page_buffers(page);
-        do {
-                if (buffer_dirty(bh) && !list_empty(&bh->b_assoc_buffers))
-                        return 1;
-                bh = bh->b_this_page;
-        } while (bh != head);
-        return 0;
-}
 static void __nilfs_end_page_io(struct page *page, int err)
 {
        if (!err) {
@@ -1889,13 +1859,26 @@ static void nilfs_end_page_io(struct page *page, int err)
        if (!page)
                return;
-        if (buffer_nilfs_node(page_buffers(page)) &&
+        if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) {
-            nilfs_page_has_uncleared_buffer(page))
+                /*
-                /* For b-tree node pages, this function may be called twice
+                 * For b-tree node pages, this function may be called twice
-                   or more because they might be split in a segment.
+                 * or more because they might be split in a segment.
-                   This check assures that cleanup has been done for all
+                 */
-                   buffers in a split btnode page. */
+                if (PageDirty(page)) {
+                        /*
+                         * For pages holding split b-tree node buffers, dirty
+                         * flag on the buffers may be cleared discretely.
+                         * In that case, the page is once redirtied for
+                         * remaining buffers, and it must be cancelled if
+                         * all the buffers get cleaned later.
+                         */
+                        lock_page(page);
+                        if (nilfs_page_buffers_clean(page))
+                                __nilfs_clear_page_dirty(page);
+                        unlock_page(page);
+                }
                return;
+        }
        __nilfs_end_page_io(page, err);
 }
@@ -1957,7 +1940,7 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
                        }
                        if (bh->b_page != fs_page) {
                                nilfs_end_page_io(fs_page, err);
-                                if (unlikely(fs_page == failed_page))
+                                if (fs_page && fs_page == failed_page)
                                        goto done;
                                fs_page = bh->b_page;
                        }
@@ -2224,10 +2207,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
                nilfs_segctor_complete_write(sci);
                /* Commit segments */
-                if (has_sr) {
+                if (has_sr)
-                        nilfs_segctor_commit_free_segments(sci);
                        nilfs_segctor_clear_metadata_dirty(sci);
-                }
                nilfs_segctor_end_construction(sci, nilfs, 0);
@@ -2301,48 +2282,6 @@ void nilfs_flush_segment(struct super_block *sb, ino_t ino)
                                        /* assign bit 0 to data files */
 }
-int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *sci,
-                                           __u64 *segnum, size_t nsegs)
-{
-        struct nilfs_segment_entry *ent;
-        struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
-        struct inode *sufile = nilfs->ns_sufile;
-        LIST_HEAD(list);
-        __u64 *pnum;
-        size_t i;
-        int err;
-        for (pnum = segnum, i = 0; i < nsegs; pnum++, i++) {
-                ent = nilfs_alloc_segment_entry(*pnum);
-                if (unlikely(!ent)) {
-                        err = -ENOMEM;
-                        goto failed;
-                }
-                list_add_tail(&ent->list, &list);
-                err = nilfs_open_segment_entry(ent, sufile);
-                if (unlikely(err))
-                        goto failed;
-                if (unlikely(!nilfs_segment_usage_dirty(ent->raw_su)))
-                        printk(KERN_WARNING "NILFS: unused segment is "
-                               "requested to be cleaned (segnum=%llu)\n",
-                               (unsigned long long)ent->segnum);
-                nilfs_close_segment_entry(ent, sufile);
-        }
-        list_splice(&list, sci->sc_cleaning_segments.prev);
-        return 0;
- failed:
-        nilfs_dispose_segment_list(&list);
-        return err;
-}
-void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *sci)
-{
-        nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
-}
 struct nilfs_segctor_wait_request {
        wait_queue_t    wq;
        __u32           seq;
@@ -2607,10 +2546,13 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
        err = nilfs_init_gcdat_inode(nilfs);
        if (unlikely(err))
                goto out_unlock;
        err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
        if (unlikely(err))
                goto out_unlock;
+        sci->sc_freesegs = kbufs[4];
+        sci->sc_nfreesegs = argv[4].v_nmembs;
        list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev);
        for (;;) {
@@ -2629,6 +2571,8 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
        }
 out_unlock:
+        sci->sc_freesegs = NULL;
+        sci->sc_nfreesegs = 0;
        nilfs_clear_gcdat_inode(nilfs);
        nilfs_transaction_unlock(sbi);
        return err;
@@ -2835,7 +2779,6 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
        INIT_LIST_HEAD(&sci->sc_dirty_files);
        INIT_LIST_HEAD(&sci->sc_segbufs);
        INIT_LIST_HEAD(&sci->sc_gc_inodes);
-        INIT_LIST_HEAD(&sci->sc_cleaning_segments);
        INIT_LIST_HEAD(&sci->sc_copied_buffers);
        sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
@@ -2901,9 +2844,6 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
                nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1);
        }
-        if (!list_empty(&sci->sc_cleaning_segments))
-                nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
        WARN_ON(!list_empty(&sci->sc_segbufs));
        down_write(&sbi->s_nilfs->ns_segctor_sem);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 476bdd5df5be..0d2a475a741b 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -90,8 +90,9 @@ struct nilfs_segsum_pointer {
 * @sc_nblk_inc: Block count of current generation
 * @sc_dirty_files: List of files to be written
 * @sc_gc_inodes: List of GC inodes having blocks to be written
- * @sc_cleaning_segments: List of segments to be freed through construction
 * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data
+ * @sc_freesegs: array of segment numbers to be freed
+ * @sc_nfreesegs: number of segments on @sc_freesegs
 * @sc_dsync_inode: inode whose data pages are written for a sync operation
 * @sc_dsync_start: start byte offset of data pages
 * @sc_dsync_end: end byte offset of data pages (inclusive)
@@ -131,9 +132,11 @@ struct nilfs_sc_info {
        struct list_head        sc_dirty_files;
        struct list_head        sc_gc_inodes;
-        struct list_head        sc_cleaning_segments;
        struct list_head        sc_copied_buffers;
+        __u64                  *sc_freesegs;
+        size_t                  sc_nfreesegs;
        struct nilfs_inode_info *sc_dsync_inode;
        loff_t                  sc_dsync_start;
        loff_t                  sc_dsync_end;
@@ -225,10 +228,6 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
 extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
                                void **);
-extern int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *,
-                                                  __u64 *, size_t);
-extern void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *);
 extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
 extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
@@ -240,5 +239,6 @@ extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *,
 extern int nilfs_recover_logical_segments(struct the_nilfs *,
                                          struct nilfs_sb_info *,
                                          struct nilfs_recovery_info *);
+extern void nilfs_dispose_segment_list(struct list_head *);
 #endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 98e68677f045..37994d4a59cc 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -18,6 +18,7 @@
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * Written by Koji Sato <koji@osrg.net>.
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
 */
 #include <linux/kernel.h>
@@ -108,6 +109,102 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
        nilfs_mdt_mark_buffer_dirty(header_bh);
 }
+/**
+ * nilfs_sufile_updatev - modify multiple segment usages at a time
+ * @sufile: inode of segment usage file
+ * @segnumv: array of segment numbers
+ * @nsegs: size of @segnumv array
+ * @create: creation flag
+ * @ndone: place to store number of modified segments on @segnumv
+ * @dofunc: primitive operation for the update
+ *
+ * Description: nilfs_sufile_updatev() repeatedly calls @dofunc
+ * against the given array of segments.  The @dofunc is called with
+ * buffers of a header block and the sufile block in which the target
+ * segment usage entry is contained.  If @ndone is given, the number
+ * of successfully modified segments from the head is stored in the
+ * place @ndone points to.
+ *
+ * Return Value: On success, zero is returned.  On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - Given segment usage is in hole block (may be returned if
+ *            @create is zero)
+ *
+ * %-EINVAL - Invalid segment usage number
+ */
+int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs,
+                         int create, size_t *ndone,
+                         void (*dofunc)(struct inode *, __u64,
+                                        struct buffer_head *,
+                                        struct buffer_head *))
+{
+        struct buffer_head *header_bh, *bh;
+        unsigned long blkoff, prev_blkoff;
+        __u64 *seg;
+        size_t nerr = 0, n = 0;
+        int ret = 0;
+        if (unlikely(nsegs == 0))
+                goto out;
+        down_write(&NILFS_MDT(sufile)->mi_sem);
+        for (seg = segnumv; seg < segnumv + nsegs; seg++) {
+                if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) {
+                        printk(KERN_WARNING
+                               "%s: invalid segment number: %llu\n", __func__,
+                               (unsigned long long)*seg);
+                        nerr++;
+                }
+        }
+        if (nerr > 0) {
+                ret = -EINVAL;
+                goto out_sem;
+        }
+        ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+        if (ret < 0)
+                goto out_sem;
+        seg = segnumv;
+        blkoff = nilfs_sufile_get_blkoff(sufile, *seg);
+        ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh);
+        if (ret < 0)
+                goto out_header;
+        for (;;) {
+                dofunc(sufile, *seg, header_bh, bh);
+                if (++seg >= segnumv + nsegs)
+                        break;
+                prev_blkoff = blkoff;
+                blkoff = nilfs_sufile_get_blkoff(sufile, *seg);
+                if (blkoff == prev_blkoff)
+                        continue;
+                /* get different block */
+                brelse(bh);
+                ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh);
+                if (unlikely(ret < 0))
+                        goto out_header;
+        }
+        brelse(bh);
+ out_header:
+        n = seg - segnumv;
+        brelse(header_bh);
+ out_sem:
+        up_write(&NILFS_MDT(sufile)->mi_sem);
+ out:
+        if (ndone)
+                *ndone = n;
+        return ret;
+}
 int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
                        void (*dofunc)(struct inode *, __u64,
                                       struct buffer_head *,
@@ -490,7 +587,8 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
 * nilfs_sufile_get_suinfo -
 * @sufile: inode of segment usage file
 * @segnum: segment number to start looking
- * @si: array of suinfo
+ * @buf: array of suinfo
+ * @sisz: byte size of suinfo
 * @nsi: size of suinfo array
 *
 * Description:
@@ -502,11 +600,12 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
 *
 * %-ENOMEM - Insufficient amount of memory available.
 */
-ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum,
+ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
-                                struct nilfs_suinfo *si, size_t nsi)
+                                unsigned sisz, size_t nsi)
 {
        struct buffer_head *su_bh;
        struct nilfs_segment_usage *su;
+        struct nilfs_suinfo *si = buf;
        size_t susz = NILFS_MDT(sufile)->mi_entry_size;
        struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
        void *kaddr;
@@ -531,20 +630,22 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum,
                        if (ret != -ENOENT)
                                goto out;
                        /* hole */
-                        memset(&si[i], 0, sizeof(struct nilfs_suinfo) * n);
+                        memset(si, 0, sisz * n);
+                        si = (void *)si + sisz * n;
                        continue;
                }
                kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
                su = nilfs_sufile_block_get_segment_usage(
                        sufile, segnum, su_bh, kaddr);
-                for (j = 0; j < n; j++, su = (void *)su + susz) {
+                for (j = 0; j < n;
-                        si[i + j].sui_lastmod = le64_to_cpu(su->su_lastmod);
+                     j++, su = (void *)su + susz, si = (void *)si + sisz) {
-                        si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks);
+                        si->sui_lastmod = le64_to_cpu(su->su_lastmod);
-                        si[i + j].sui_flags = le32_to_cpu(su->su_flags) &
+                        si->sui_nblocks = le32_to_cpu(su->su_nblocks);
+                        si->sui_flags = le32_to_cpu(su->su_flags) &
                                ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
                        if (nilfs_segment_is_active(nilfs, segnum + j))
-                                si[i + j].sui_flags |=
+                                si->sui_flags |=
                                        (1UL << NILFS_SEGMENT_USAGE_ACTIVE);
                }
                kunmap_atomic(kaddr, KM_USER0);
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index a2e2efd4ade1..a2c4d76c3366 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -43,43 +43,27 @@ void nilfs_sufile_put_segment_usage(struct inode *, __u64,
                                    struct buffer_head *);
 int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
 int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
-ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *,
+ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
                                size_t);
+int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *,
+                         void (*dofunc)(struct inode *, __u64,
+                                        struct buffer_head *,
+                                        struct buffer_head *));
 int nilfs_sufile_update(struct inode *, __u64, int,
                        void (*dofunc)(struct inode *, __u64,
                                       struct buffer_head *,
                                       struct buffer_head *));
-void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
-                                 struct buffer_head *);
 void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *,
                           struct buffer_head *);
 void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *,
                          struct buffer_head *);
+void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
+                                 struct buffer_head *);
 void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
                               struct buffer_head *);
 /**
- * nilfs_sufile_cancel_free -
- * @sufile: inode of segment usage file
- * @segnum: segment number
- *
- * Description:
- *
- * Return Value: On success, 0 is returned. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- */
-static inline int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum)
-{
-        return nilfs_sufile_update(sufile, segnum, 0,
-                                   nilfs_sufile_do_cancel_free);
-}
-/**
 * nilfs_sufile_scrap - make a segment garbage
 * @sufile: inode of segment usage file
 * @segnum: segment number to be freed
@@ -100,6 +84,38 @@ static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum)
 }
 /**
+ * nilfs_sufile_freev - free segments
+ * @sufile: inode of segment usage file
+ * @segnumv: array of segment numbers
+ * @nsegs: size of @segnumv array
+ * @ndone: place to store the number of freed segments
+ */
+static inline int nilfs_sufile_freev(struct inode *sufile, __u64 *segnumv,
+                                     size_t nsegs, size_t *ndone)
+{
+        return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone,
+                                    nilfs_sufile_do_free);
+}
+/**
+ * nilfs_sufile_cancel_freev - reallocate freeing segments
+ * @sufile: inode of segment usage file
+ * @segnumv: array of segment numbers
+ * @nsegs: size of @segnumv array
+ * @ndone: place to store the number of cancelled segments
+ *
+ * Return Value: On success, 0 is returned. On error, a negative error code
+ * is returned.
+ */
+static inline int nilfs_sufile_cancel_freev(struct inode *sufile,
+                                            __u64 *segnumv, size_t nsegs,
+                                            size_t *ndone)
+{
+        return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone,
+                                    nilfs_sufile_do_cancel_free);
+}
+/**
 * nilfs_sufile_set_error - mark a segment as erroneous
 * @sufile: inode of segment usage file
 * @segnum: segment number
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1777a3467bd2..8e2ec43b18f4 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -133,7 +133,7 @@ void nilfs_warning(struct super_block *sb, const char *function,
 static struct kmem_cache *nilfs_inode_cachep;
-struct inode *nilfs_alloc_inode(struct super_block *sb)
+struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
 {
        struct nilfs_inode_info *ii;
@@ -143,10 +143,15 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
        ii->i_bh = NULL;
        ii->i_state = 0;
        ii->vfs_inode.i_version = 1;
-        nilfs_btnode_cache_init(&ii->i_btnode_cache);
+        nilfs_btnode_cache_init(&ii->i_btnode_cache, nilfs->ns_bdi);
        return &ii->vfs_inode;
 }
+struct inode *nilfs_alloc_inode(struct super_block *sb)
+{
+        return nilfs_alloc_inode_common(NILFS_SB(sb)->s_nilfs);
+}
 void nilfs_destroy_inode(struct inode *inode)
 {
        kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
@@ -184,16 +189,6 @@ static void nilfs_clear_inode(struct inode *inode)
 {
        struct nilfs_inode_info *ii = NILFS_I(inode);
-#ifdef CONFIG_NILFS_POSIX_ACL
-        if (ii->i_acl && ii->i_acl != NILFS_ACL_NOT_CACHED) {
-                posix_acl_release(ii->i_acl);
-                ii->i_acl = NILFS_ACL_NOT_CACHED;
-        }
-        if (ii->i_default_acl && ii->i_default_acl != NILFS_ACL_NOT_CACHED) {
-                posix_acl_release(ii->i_default_acl);
-                ii->i_default_acl = NILFS_ACL_NOT_CACHED;
-        }
-#endif
        /*
         * Free resources allocated in nilfs_read_inode(), here.
         */
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index e4e5c78bcc93..8b8889825716 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -32,7 +32,6 @@
 #include "cpfile.h"
 #include "sufile.h"
 #include "dat.h"
-#include "seglist.h"
 #include "segbuf.h"
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 9b0efdad8910..477d37d83b31 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -15,6 +15,7 @@
 #include <linux/errno.h>
 #include <linux/kmod.h>
 #include <linux/spinlock.h>
+#include <asm/byteorder.h>
 static struct nls_table default_table;
 static struct nls_table *tables = &default_table;
@@ -43,10 +44,17 @@ static const struct utf8_table utf8_table[] =
    {0,                                                /* end of table    */}
 };
-int
+#define UNICODE_MAX     0x0010ffff
-utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
+#define PLANE_SIZE      0x00010000
+#define SURROGATE_MASK  0xfffff800
+#define SURROGATE_PAIR  0x0000d800
+#define SURROGATE_LOW   0x00000400
+#define SURROGATE_BITS  0x000003ff
+int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
 {
-        long l;
+        unsigned long l;
        int c0, c, nc;
        const struct utf8_table *t;
  
@@ -57,12 +65,13 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
                nc++;
                if ((c0 & t->cmask) == t->cval) {
                        l &= t->lmask;
-                        if (l < t->lval)
+                        if (l < t->lval || l > UNICODE_MAX ||
+                                        (l & SURROGATE_MASK) == SURROGATE_PAIR)
                                return -1;
-                        *p = l;
+                        *pu = (unicode_t) l;
                        return nc;
                }
-                if (n <= nc)
+                if (len <= nc)
                        return -1;
                s++;
                c = (*s ^ 0x80) & 0xFF;
@@ -72,90 +81,133 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
        }
        return -1;
 }
+EXPORT_SYMBOL(utf8_to_utf32);
-int
+int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
-utf8_mbstowcs(wchar_t *pwcs, const __u8 *s, int n)
 {
-        __u16 *op;
+        unsigned long l;
-        const __u8 *ip;
-        int size;
-        op = pwcs;
-        ip = s;
-        while (*ip && n > 0) {
-                if (*ip & 0x80) {
-                        size = utf8_mbtowc(op, ip, n);
-                        if (size == -1) {
-                                /* Ignore character and move on */
-                                ip++;
-                                n--;
-                        } else {
-                                op++;
-                                ip += size;
-                                n -= size;
-                        }
-                } else {
-                        *op++ = *ip++;
-                        n--;
-                }
-        }
-        return (op - pwcs);
-}
-int
-utf8_wctomb(__u8 *s, wchar_t wc, int maxlen)
-{
-        long l;
        int c, nc;
        const struct utf8_table *t;
-  
        if (!s)
                return 0;
-  
-        l = wc;
+        l = u;
+        if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR)
+                return -1;
        nc = 0;
        for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) {
                nc++;
                if (l <= t->lmask) {
                        c = t->shift;
-                        *s = t->cval | (l >> c);
+                        *s = (u8) (t->cval | (l >> c));
                        while (c > 0) {
                                c -= 6;
                                s++;
-                                *s = 0x80 | ((l >> c) & 0x3F);
+                                *s = (u8) (0x80 | ((l >> c) & 0x3F));
                        }
                        return nc;
                }
        }
        return -1;
 }
+EXPORT_SYMBOL(utf32_to_utf8);
-int
+int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
-utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen)
 {
-        const __u16 *ip;
+        u16 *op;
-        __u8 *op;
        int size;
+        unicode_t u;
+        op = pwcs;
+        while (*s && len > 0) {
+                if (*s & 0x80) {
+                        size = utf8_to_utf32(s, len, &u);
+                        if (size < 0) {
+                                /* Ignore character and move on */
+                                size = 1;
+                        } else if (u >= PLANE_SIZE) {
+                                u -= PLANE_SIZE;
+                                *op++ = (wchar_t) (SURROGATE_PAIR |
+                                                ((u >> 10) & SURROGATE_BITS));
+                                *op++ = (wchar_t) (SURROGATE_PAIR |
+                                                SURROGATE_LOW |
+                                                (u & SURROGATE_BITS));
+                        } else {
+                                *op++ = (wchar_t) u;
+                        }
+                        s += size;
+                        len -= size;
+                } else {
+                        *op++ = *s++;
+                        len--;
+                }
+        }
+        return op - pwcs;
+}
+EXPORT_SYMBOL(utf8s_to_utf16s);
+static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian)
+{
+        switch (endian) {
+        default:
+                return c;
+        case UTF16_LITTLE_ENDIAN:
+                return __le16_to_cpu(c);
+        case UTF16_BIG_ENDIAN:
+                return __be16_to_cpu(c);
+        }
+}
+int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian,
+                u8 *s, int maxlen)
+{
+        u8 *op;
+        int size;
+        unsigned long u, v;
        op = s;
-        ip = pwcs;
+        while (len > 0 && maxlen > 0) {
-        while (*ip && maxlen > 0) {
+                u = get_utf16(*pwcs, endian);
-                if (*ip > 0x7f) {
+                if (!u)
-                        size = utf8_wctomb(op, *ip, maxlen);
+                        break;
+                pwcs++;
+                len--;
+                if (u > 0x7f) {
+                        if ((u & SURROGATE_MASK) == SURROGATE_PAIR) {
+                                if (u & SURROGATE_LOW) {
+                                        /* Ignore character and move on */
+                                        continue;
+                                }
+                                if (len <= 0)
+                                        break;
+                                v = get_utf16(*pwcs, endian);
+                                if ((v & SURROGATE_MASK) != SURROGATE_PAIR ||
+                                                !(v & SURROGATE_LOW)) {
+                                        /* Ignore character and move on */
+                                        continue;
+                                }
+                                u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10)
+                                                + (v & SURROGATE_BITS);
+                                pwcs++;
+                                len--;
+                        }
+                        size = utf32_to_utf8(u, op, maxlen);
                        if (size == -1) {
                                /* Ignore character and move on */
-                                maxlen--;
                        } else {
                                op += size;
                                maxlen -= size;
                        }
                } else {
-                        *op++ = (__u8) *ip;
+                        *op++ = (u8) u;
+                        maxlen--;
                }
-                ip++;
        }
-        return (op - s);
+        return op - s;
 }
+EXPORT_SYMBOL(utf16s_to_utf8s);
 int register_nls(struct nls_table * nls)
 {
@@ -467,9 +519,5 @@ EXPORT_SYMBOL(unregister_nls);
 EXPORT_SYMBOL(unload_nls);
 EXPORT_SYMBOL(load_nls);
 EXPORT_SYMBOL(load_nls_default);
-EXPORT_SYMBOL(utf8_mbtowc);
-EXPORT_SYMBOL(utf8_mbstowcs);
-EXPORT_SYMBOL(utf8_wctomb);
-EXPORT_SYMBOL(utf8_wcstombs);
 MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c
index aa2c42fdd977..0d60a44acacd 100644
--- a/fs/nls/nls_utf8.c
+++ b/fs/nls/nls_utf8.c
@@ -15,7 +15,11 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
 {
        int n;
-        if ( (n = utf8_wctomb(out, uni, boundlen)) == -1) {
+        if (boundlen <= 0)
+                return -ENAMETOOLONG;
+        n = utf32_to_utf8(uni, out, boundlen);
+        if (n < 0) {
                *out = '?';
                return -EINVAL;
        }
@@ -25,11 +29,14 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
 static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
 {
        int n;
+        unicode_t u;
-        if ( (n = utf8_mbtowc(uni, rawstring, boundlen)) == -1) {
+        n = utf8_to_utf32(rawstring, boundlen, &u);
+        if (n < 0 || u > MAX_WCHAR_T) {
                *uni = 0x003f;  /* ? */
-                n = -EINVAL;
+                return -EINVAL;
        }
+        *uni = (wchar_t) u;
        return n;
 }
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 31dac7e3b0f1..dffbb0911d02 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,15 +1,5 @@
 config FSNOTIFY
-        bool "Filesystem notification backend"
+        def_bool n
-        default y
-        ---help---
-           fsnotify is a backend for filesystem notification.  fsnotify does
-           not provide any userspace interface but does provide the basis
-           needed for other notification schemes such as dnotify, inotify,
-           and fanotify.
-           Say Y here to enable fsnotify suport.
-           If unsure, say Y.
 source "fs/notify/dnotify/Kconfig"
 source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
index 904ff8d5405a..f9c1ca139d8f 100644
--- a/fs/notify/dnotify/Kconfig
+++ b/fs/notify/dnotify/Kconfig
@@ -1,6 +1,6 @@
 config DNOTIFY
        bool "Dnotify support"
-        depends on FSNOTIFY
+        select FSNOTIFY
        default y
        help
          Dnotify is a directory-based per-fd file change notification system
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ec2f7bd76818..037e878e03fc 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -159,7 +159,9 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
                        if (!group->ops->should_send_event(group, to_tell, mask))
                                continue;
                        if (!event) {
-                                event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie);
+                                event = fsnotify_create_event(to_tell, mask, data,
+                                                              data_is, file_name, cookie,
+                                                              GFP_KERNEL);
                                /* shit, we OOM'd and now we can't tell, maybe
                                 * someday someone else will want to do something
                                 * here */
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 5356884289a1..3e56dbffe729 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -15,7 +15,7 @@ config INOTIFY
 config INOTIFY_USER
        bool "Inotify support for userspace"
-        depends on FSNOTIFY
+        select FSNOTIFY
        default y
        ---help---
          Say Y here to enable inotify support for userspace, including the
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index ea2605a58b8a..f234f3a4c8ca 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -15,7 +15,8 @@ struct inotify_inode_mark_entry {
        int wd;
 };
-extern void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
+extern void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
+                                           struct fsnotify_group *group);
 extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
 extern const struct fsnotify_ops inotify_fsnotify_ops;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 7ef75b83247e..47cd258fd24d 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -81,7 +81,7 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
 static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
 {
-        inotify_destroy_mark_entry(entry, group);
+        inotify_ignored_and_remove_idr(entry, group);
 }
 static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 982a412ac5bc..f30d9bbc2e1b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -57,7 +57,6 @@ int inotify_max_user_watches __read_mostly;
 static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
 struct kmem_cache *event_priv_cachep __read_mostly;
-static struct fsnotify_event *inotify_ignored_event;
 /*
 * When inotify registers a new group it increments this and uses that
@@ -296,12 +295,15 @@ static int inotify_fasync(int fd, struct file *file, int on)
 static int inotify_release(struct inode *ignored, struct file *file)
 {
        struct fsnotify_group *group = file->private_data;
+        struct user_struct *user = group->inotify_data.user;
        fsnotify_clear_marks_by_group(group);
        /* free this group, matching get was inotify_init->fsnotify_obtain_group */
        fsnotify_put_group(group);
+        atomic_dec(&user->inotify_devs);
        return 0;
 }
@@ -362,43 +364,38 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
        return error;
 }
+static void inotify_remove_from_idr(struct fsnotify_group *group,
+                                    struct inotify_inode_mark_entry *ientry)
+{
+        struct idr *idr;
+        spin_lock(&group->inotify_data.idr_lock);
+        idr = &group->inotify_data.idr;
+        idr_remove(idr, ientry->wd);
+        spin_unlock(&group->inotify_data.idr_lock);
+        ientry->wd = -1;
+}
 /*
- * When, for whatever reason, inotify is done with a mark (or what used to be a
+ * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the
- * watch) we need to remove that watch from the idr and we need to send IN_IGNORED
+ * internal reference held on the mark because it is in the idr.
- * for the given wd.
- *
- * There is a bit of recursion here.  The loop looks like:
- *      inotify_destroy_mark_entry -> fsnotify_destroy_mark_by_entry ->
- *      inotify_freeing_mark -> inotify_destory_mark_entry -> restart
- * But the loop is broken in 2 places.  fsnotify_destroy_mark_by_entry sets
- * entry->group = NULL before the call to inotify_freeing_mark, so the if (egroup)
- * test below will not call back to fsnotify again.  But even if that test wasn't
- * there this would still be safe since fsnotify_destroy_mark_by_entry() is
- * safe from recursion.
 */
-void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
+void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
+                                    struct fsnotify_group *group)
 {
        struct inotify_inode_mark_entry *ientry;
+        struct fsnotify_event *ignored_event;
        struct inotify_event_private_data *event_priv;
        struct fsnotify_event_private_data *fsn_event_priv;
-        struct fsnotify_group *egroup;
-        struct idr *idr;
-        spin_lock(&entry->lock);
-        egroup = entry->group;
-        /* if egroup we aren't really done and something might still send events
+        ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
-         * for this inode, on the callback we'll send the IN_IGNORED */
+                                              FSNOTIFY_EVENT_NONE, NULL, 0,
-        if (egroup) {
+                                              GFP_NOFS);
-                spin_unlock(&entry->lock);
+        if (!ignored_event)
-                fsnotify_destroy_mark_by_entry(entry);
                return;
-        }
-        spin_unlock(&entry->lock);
        ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
-        event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
+        event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
        if (unlikely(!event_priv))
                goto skip_send_ignore;
@@ -407,7 +404,7 @@ void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnoti
        fsn_event_priv->group = group;
        event_priv->wd = ientry->wd;
-        fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv);
+        fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
        /* did the private data get added? */
        if (list_empty(&fsn_event_priv->event_list))
@@ -415,14 +412,16 @@ void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnoti
 skip_send_ignore:
+        /* matches the reference taken when the event was created */
+        fsnotify_put_event(ignored_event);
        /* remove this entry from the idr */
-        spin_lock(&group->inotify_data.idr_lock);
+        inotify_remove_from_idr(group, ientry);
-        idr = &group->inotify_data.idr;
-        idr_remove(idr, ientry->wd);
-        spin_unlock(&group->inotify_data.idr_lock);
        /* removed from idr, drop that reference */
        fsnotify_put_mark(entry);
+        atomic_dec(&group->inotify_data.user->inotify_watches);
 }
 /* ding dong the mark is dead */
@@ -437,6 +436,7 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
 {
        struct fsnotify_mark_entry *entry = NULL;
        struct inotify_inode_mark_entry *ientry;
+        struct inotify_inode_mark_entry *tmp_ientry;
        int ret = 0;
        int add = (arg & IN_MASK_ADD);
        __u32 mask;
@@ -447,54 +447,66 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
        if (unlikely(!mask))
                return -EINVAL;
-        ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
+        tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
-        if (unlikely(!ientry))
+        if (unlikely(!tmp_ientry))
                return -ENOMEM;
        /* we set the mask at the end after attaching it */
-        fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark);
+        fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
-        ientry->wd = 0;
+        tmp_ientry->wd = -1;
 find_entry:
        spin_lock(&inode->i_lock);
        entry = fsnotify_find_mark_entry(group, inode);
        spin_unlock(&inode->i_lock);
        if (entry) {
-                kmem_cache_free(inotify_inode_mark_cachep, ientry);
                ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
        } else {
-                if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) {
+                ret = -ENOSPC;
-                        ret = -ENOSPC;
+                if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
                        goto out_err;
-                }
-                ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode);
-                if (ret == -EEXIST)
-                        goto find_entry;
-                else if (ret)
-                        goto out_err;
-                entry = &ientry->fsn_entry;
 retry:
                ret = -ENOMEM;
                if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
                        goto out_err;
                spin_lock(&group->inotify_data.idr_lock);
-                /* if entry is added to the idr we keep the reference obtained
+                ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
-                 * through fsnotify_mark_add.  remember to drop this reference
+                                        group->inotify_data.last_wd,
-                 * when entry is removed from idr */
+                                        &tmp_ientry->wd);
-                ret = idr_get_new_above(&group->inotify_data.idr, entry,
-                                        ++group->inotify_data.last_wd,
-                                        &ientry->wd);
                spin_unlock(&group->inotify_data.idr_lock);
                if (ret) {
                        if (ret == -EAGAIN)
                                goto retry;
                        goto out_err;
                }
+                ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
+                if (ret) {
+                        inotify_remove_from_idr(group, tmp_ientry);
+                        if (ret == -EEXIST)
+                                goto find_entry;
+                        goto out_err;
+                }
+                /* tmp_ientry has been added to the inode, so we are all set up.
+                 * now we just need to make sure tmp_ientry doesn't get freed and
+                 * we need to set up entry and ientry so the generic code can
+                 * do its thing. */
+                ientry = tmp_ientry;
+                entry = &ientry->fsn_entry;
+                tmp_ientry = NULL;
                atomic_inc(&group->inotify_data.user->inotify_watches);
+                /* update the idr hint */
+                group->inotify_data.last_wd = ientry->wd;
+                /* we put the mark on the idr, take a reference */
+                fsnotify_get_mark(entry);
        }
+        ret = ientry->wd;
        spin_lock(&entry->lock);
        old_mask = entry->mask;
@@ -525,14 +537,19 @@ retry:
                        fsnotify_recalc_group_mask(group);
        }
-        return ientry->wd;
+        /* this either matches fsnotify_find_mark_entry, or init_mark_entry
+         * depending on which path we took... */
+        fsnotify_put_mark(entry);
 out_err:
-        /* see this isn't supposed to happen, just kill the watch */
+        /* could be an error, could be that we found an existing mark */
-        if (entry) {
+        if (tmp_ientry) {
-                fsnotify_destroy_mark_by_entry(entry);
+                /* on the idr but didn't make it on the inode */
-                fsnotify_put_mark(entry);
+                if (tmp_ientry->wd != -1)
+                        inotify_remove_from_idr(group, tmp_ientry);
+                kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
        }
        return ret;
 }
@@ -699,7 +716,7 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
        fsnotify_get_mark(entry);
        spin_unlock(&group->inotify_data.idr_lock);
-        inotify_destroy_mark_entry(entry, group);
+        fsnotify_destroy_mark_by_entry(entry);
        fsnotify_put_mark(entry);
 out:
@@ -740,9 +757,6 @@ static int __init inotify_user_setup(void)
        inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
        event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
-        inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
-        if (!inotify_ignored_event)
-                panic("unable to allocate the inotify ignored event\n");
        inotify_max_queued_events = 16384;
        inotify_max_user_instances = 128;
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 959b73e756fd..521368574e97 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -136,18 +136,24 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
 {
        if ((old->mask == new->mask) &&
            (old->to_tell == new->to_tell) &&
-            (old->data_type == new->data_type)) {
+            (old->data_type == new->data_type) &&
+            (old->name_len == new->name_len)) {
                switch (old->data_type) {
                case (FSNOTIFY_EVENT_INODE):
-                        if (old->inode == new->inode)
+                        /* remember, after old was put on the wait_q we aren't
+                         * allowed to look at the inode any more; the only thing
+                         * left to check is whether the file_name is the same */
+                        if (old->name_len &&
+                            !strcmp(old->file_name, new->file_name))
                                return true;
                        break;
                case (FSNOTIFY_EVENT_PATH):
                        if ((old->path.mnt == new->path.mnt) &&
                            (old->path.dentry == new->path.dentry))
                                return true;
+                        break;
                case (FSNOTIFY_EVENT_NONE):
-                        return true;
+                        return false;
                };
        }
        return false;
@@ -339,18 +345,19 @@ static void initialize_event(struct fsnotify_event *event)
 * @name the filename, if available
 */
 struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
-                                             int data_type, const char *name, u32 cookie)
+                                             int data_type, const char *name, u32 cookie,
+                                             gfp_t gfp)
 {
        struct fsnotify_event *event;
-        event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
+        event = kmem_cache_alloc(fsnotify_event_cachep, gfp);
        if (!event)
                return NULL;
        initialize_event(event);
        if (name) {
-                event->file_name = kstrdup(name, GFP_KERNEL);
+                event->file_name = kstrdup(name, gfp);
                if (!event->file_name) {
                        kmem_cache_free(fsnotify_event_cachep, event);
                        return NULL;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 82c5085559c6..9938034762cc 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -27,6 +27,7 @@
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/slab.h>
+#include <linux/log2.h>
 #include "aops.h"
 #include "attrib.h"
@@ -1570,7 +1571,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
        ntfs_debug("Index collation rule is 0x%x.",
                        le32_to_cpu(ir->collation_rule));
        ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
-        if (ni->itype.index.block_size & (ni->itype.index.block_size - 1)) {
+        if (!is_power_of_2(ni->itype.index.block_size)) {
                ntfs_error(vi->i_sb, "Index block size (%u) is not a power of "
                                "two.", ni->itype.index.block_size);
                goto unm_err_out;
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index d7932e95b1fd..89b02985c054 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -26,6 +26,7 @@
 #include <linux/highmem.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
+#include <linux/log2.h>
 #include "attrib.h"
 #include "aops.h"
@@ -65,7 +66,7 @@ static bool ntfs_check_restart_page_header(struct inode *vi,
                        logfile_log_page_size < NTFS_BLOCK_SIZE ||
                        logfile_system_page_size &
                        (logfile_system_page_size - 1) ||
-                        logfile_log_page_size & (logfile_log_page_size - 1)) {
+                        !is_power_of_2(logfile_log_page_size)) {
                ntfs_error(vi->i_sb, "$LogFile uses unsupported page size.");
                return false;
        }
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 678a067d9251..9edcde4974aa 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -475,6 +475,12 @@ struct ocfs2_path {
 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
+static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
+                           u32 cpos);
+static void ocfs2_adjust_rightmost_records(struct inode *inode,
+                                           handle_t *handle,
+                                           struct ocfs2_path *path,
+                                           struct ocfs2_extent_rec *insert_rec);
 /*
 * Reset the actual path elements so that we can re-use the structure
 * to build another path. Generally, this involves freeing the buffer
@@ -1013,6 +1019,54 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
 }
 /*
+ * Change the range of the branches in the rightmost path according to the leaf
+ * extent block's rightmost record.
+ */
+static int ocfs2_adjust_rightmost_branch(handle_t *handle,
+                                         struct inode *inode,
+                                         struct ocfs2_extent_tree *et)
+{
+        int status;
+        struct ocfs2_path *path = NULL;
+        struct ocfs2_extent_list *el;
+        struct ocfs2_extent_rec *rec;
+        path = ocfs2_new_path_from_et(et);
+        if (!path) {
+                status = -ENOMEM;
+                return status;
+        }
+        status = ocfs2_find_path(inode, path, UINT_MAX);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        status = ocfs2_extend_trans(handle, path_num_items(path) +
+                                    handle->h_buffer_credits);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        status = ocfs2_journal_access_path(inode, handle, path);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        el = path_leaf_el(path);
+        rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1];
+        ocfs2_adjust_rightmost_records(inode, handle, path, rec);
+out:
+        ocfs2_free_path(path);
+        return status;
+}
+/*
 * Add an entire tree branch to our inode. eb_bh is the extent block
 * to start at, if we don't want to start the branch at the dinode
 * structure.
@@ -1038,7 +1092,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list  *eb_el;
        struct ocfs2_extent_list  *el;
-        u32 new_cpos;
+        u32 new_cpos, root_end;
        mlog_entry_void();
@@ -1055,6 +1109,27 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
        new_blocks = le16_to_cpu(el->l_tree_depth);
+        eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
+        new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
+        root_end = ocfs2_sum_rightmost_rec(et->et_root_el);
+        /*
+         * If there is a gap between the root end and the real end
+         * of the rightmost leaf block, we need to remove the gap
+         * between new_cpos and root_end first so that the tree
+         * is consistent after we add a new branch (it will start
+         * from new_cpos).
+         */
+        if (root_end > new_cpos) {
+                mlog(0, "adjust the cluster end from %u to %u\n",
+                     root_end, new_cpos);
+                status = ocfs2_adjust_rightmost_branch(handle, inode, et);
+                if (status) {
+                        mlog_errno(status);
+                        goto bail;
+                }
+        }
        /* allocate the number of new eb blocks we need */
        new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
                             GFP_KERNEL);
@@ -1071,9 +1146,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
                goto bail;
        }
-        eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
-        new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
        /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
         * linked with the rest of the tree.
         * conversly, new_eb_bhs[0] is the new bottommost leaf.
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index 2a947c44e594..a1163b8b417c 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -22,6 +22,9 @@
 #include <linux/crc32.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/fs.h>
 #include <asm/byteorder.h>
 #include <cluster/masklog.h>
@@ -222,6 +225,155 @@ void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
        ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
 }
+/*
+ * Debugfs handling.
+ */
+#ifdef CONFIG_DEBUG_FS
+static int blockcheck_u64_get(void *data, u64 *val)
+{
+        *val = *(u64 *)data;
+        return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n");
+static struct dentry *blockcheck_debugfs_create(const char *name,
+                                                struct dentry *parent,
+                                                u64 *value)
+{
+        return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value,
+                                   &blockcheck_fops);
+}
+static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
+{
+        if (stats) {
+                debugfs_remove(stats->b_debug_check);
+                stats->b_debug_check = NULL;
+                debugfs_remove(stats->b_debug_failure);
+                stats->b_debug_failure = NULL;
+                debugfs_remove(stats->b_debug_recover);
+                stats->b_debug_recover = NULL;
+                debugfs_remove(stats->b_debug_dir);
+                stats->b_debug_dir = NULL;
+        }
+}
+static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+                                          struct dentry *parent)
+{
+        int rc = -EINVAL;
+        if (!stats)
+                goto out;
+        stats->b_debug_dir = debugfs_create_dir("blockcheck", parent);
+        if (!stats->b_debug_dir)
+                goto out;
+        stats->b_debug_check =
+                blockcheck_debugfs_create("blocks_checked",
+                                          stats->b_debug_dir,
+                                          &stats->b_check_count);
+        stats->b_debug_failure =
+                blockcheck_debugfs_create("checksums_failed",
+                                          stats->b_debug_dir,
+                                          &stats->b_failure_count);
+        stats->b_debug_recover =
+                blockcheck_debugfs_create("ecc_recoveries",
+                                          stats->b_debug_dir,
+                                          &stats->b_recover_count);
+        if (stats->b_debug_check && stats->b_debug_failure &&
+            stats->b_debug_recover)
+                rc = 0;
+out:
+        if (rc)
+                ocfs2_blockcheck_debug_remove(stats);
+        return rc;
+}
+#else
+static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+                                                 struct dentry *parent)
+{
+        return 0;
+}
+static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
+{
+}
+#endif  /* CONFIG_DEBUG_FS */
+/* Always-called wrappers for starting and stopping the debugfs files */
+int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+                                           struct dentry *parent)
+{
+        return ocfs2_blockcheck_debug_install(stats, parent);
+}
+void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats)
+{
+        ocfs2_blockcheck_debug_remove(stats);
+}
+static void ocfs2_blockcheck_inc_check(struct ocfs2_blockcheck_stats *stats)
+{
+        u64 new_count;
+        if (!stats)
+                return;
+        spin_lock(&stats->b_lock);
+        stats->b_check_count++;
+        new_count = stats->b_check_count;
+        spin_unlock(&stats->b_lock);
+        if (!new_count)
+                mlog(ML_NOTICE, "Block check count has wrapped\n");
+}
+static void ocfs2_blockcheck_inc_failure(struct ocfs2_blockcheck_stats *stats)
+{
+        u64 new_count;
+        if (!stats)
+                return;
+        spin_lock(&stats->b_lock);
+        stats->b_failure_count++;
+        new_count = stats->b_failure_count;
+        spin_unlock(&stats->b_lock);
+        if (!new_count)
+                mlog(ML_NOTICE, "Checksum failure count has wrapped\n");
+}
+static void ocfs2_blockcheck_inc_recover(struct ocfs2_blockcheck_stats *stats)
+{
+        u64 new_count;
+        if (!stats)
+                return;
+        spin_lock(&stats->b_lock);
+        stats->b_recover_count++;
+        new_count = stats->b_recover_count;
+        spin_unlock(&stats->b_lock);
+        if (!new_count)
+                mlog(ML_NOTICE, "ECC recovery count has wrapped\n");
+}
+/*
+ * These are the low-level APIs for using the ocfs2_block_check structure.
+ */
 /*
 * This function generates check information for a block.
 * data is the block to be checked.  bc is a pointer to the
@@ -266,12 +418,15 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
 * Again, the data passed in should be the on-disk endian.
 */
 int ocfs2_block_check_validate(void *data, size_t blocksize,
-                               struct ocfs2_block_check *bc)
+                               struct ocfs2_block_check *bc,
+                               struct ocfs2_blockcheck_stats *stats)
 {
        int rc = 0;
        struct ocfs2_block_check check;
        u32 crc, ecc;
+        ocfs2_blockcheck_inc_check(stats);
        check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
        check.bc_ecc = le16_to_cpu(bc->bc_ecc);
@@ -282,6 +437,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
        if (crc == check.bc_crc32e)
                goto out;
+        ocfs2_blockcheck_inc_failure(stats);
        mlog(ML_ERROR,
             "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
             (unsigned int)check.bc_crc32e, (unsigned int)crc);
@@ -292,8 +448,10 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
        /* And check the crc32 again */
        crc = crc32_le(~0, data, blocksize);
-        if (crc == check.bc_crc32e)
+        if (crc == check.bc_crc32e) {
+                ocfs2_blockcheck_inc_recover(stats);
                goto out;
+        }
        mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
             (unsigned int)check.bc_crc32e, (unsigned int)crc);
@@ -366,7 +524,8 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
 * Again, the data passed in should be the on-disk endian.
 */
 int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
-                                   struct ocfs2_block_check *bc)
+                                   struct ocfs2_block_check *bc,
+                                   struct ocfs2_blockcheck_stats *stats)
 {
        int i, rc = 0;
        struct ocfs2_block_check check;
@@ -377,6 +536,8 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
        if (!nr)
                return 0;
+        ocfs2_blockcheck_inc_check(stats);
        check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
        check.bc_ecc = le16_to_cpu(bc->bc_ecc);
@@ -388,6 +549,7 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
        if (crc == check.bc_crc32e)
                goto out;
+        ocfs2_blockcheck_inc_failure(stats);
        mlog(ML_ERROR,
             "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
             (unsigned int)check.bc_crc32e, (unsigned int)crc);
@@ -416,8 +578,10 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
        /* And check the crc32 again */
        for (i = 0, crc = ~0; i < nr; i++)
                crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
-        if (crc == check.bc_crc32e)
+        if (crc == check.bc_crc32e) {
+                ocfs2_blockcheck_inc_recover(stats);
                goto out;
+        }
        mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
             (unsigned int)check.bc_crc32e, (unsigned int)crc);
@@ -448,9 +612,11 @@ int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
                            struct ocfs2_block_check *bc)
 {
        int rc = 0;
+        struct ocfs2_super *osb = OCFS2_SB(sb);
-        if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+        if (ocfs2_meta_ecc(osb))
-                rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc);
+                rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc,
+                                                &osb->osb_ecc_stats);
        return rc;
 }
@@ -468,9 +634,11 @@ int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
                                struct ocfs2_block_check *bc)
 {
        int rc = 0;
+        struct ocfs2_super *osb = OCFS2_SB(sb);
-        if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+        if (ocfs2_meta_ecc(osb))
-                rc = ocfs2_block_check_validate_bhs(bhs, nr, bc);
+                rc = ocfs2_block_check_validate_bhs(bhs, nr, bc,
+                                                    &osb->osb_ecc_stats);
        return rc;
 }
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
index 70ec3feda32f..d4b69febf70a 100644
--- a/fs/ocfs2/blockcheck.h
+++ b/fs/ocfs2/blockcheck.h
@@ -21,6 +21,24 @@
 #define OCFS2_BLOCKCHECK_H
+/*
+ * Count errors and error correction from blockcheck.c.
+ *
+ * A pointer to this structure is the 'stats' argument of
+ * ocfs2_block_check_validate() and ocfs2_block_check_validate_bhs().
+ */
+struct ocfs2_blockcheck_stats {
+        /* NOTE(review): presumably guards the counters below -- confirm */
+        spinlock_t b_lock;
+        u64 b_check_count;      /* Number of blocks we've checked */
+        u64 b_failure_count;    /* Number of failed checksums */
+        u64 b_recover_count;    /* Number of blocks fixed by ecc */
+        /*
+         * debugfs entries, used if this is passed to
+         * ocfs2_blockcheck_stats_debugfs_install()
+         */
+        struct dentry *b_debug_dir;     /* Parent of the debugfs files */
+        struct dentry *b_debug_check;   /* Exposes b_check_count */
+        struct dentry *b_debug_failure; /* Exposes b_failure_count */
+        struct dentry *b_debug_recover; /* Exposes b_recover_count */
+};
 /* High level block API */
 void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
                            struct ocfs2_block_check *bc);
@@ -37,11 +55,18 @@ int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
 void ocfs2_block_check_compute(void *data, size_t blocksize,
                               struct ocfs2_block_check *bc);
 int ocfs2_block_check_validate(void *data, size_t blocksize,
-                               struct ocfs2_block_check *bc);
+                               struct ocfs2_block_check *bc,
+                               struct ocfs2_blockcheck_stats *stats);
 void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
                                   struct ocfs2_block_check *bc);
 int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
-                                   struct ocfs2_block_check *bc);
+                                   struct ocfs2_block_check *bc,
+                                   struct ocfs2_blockcheck_stats *stats);
+/* Debug Initialization */
+int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+                                           struct dentry *parent);
+void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats);
 /*
 * Hamming code functions
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 7e72a81bc2d4..696c32e50716 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -48,34 +48,33 @@
 * only emit the appropriage printk() when the caller passes in a constant
 * mask, as is almost always the case.
 *
- * All this bitmask nonsense is hidden from the /proc interface so that Joel
+ * All this bitmask nonsense is managed from the files under
- * doesn't have an aneurism.  Reading the file gives a straight forward
+ * /sys/fs/o2cb/logmask/.  Reading the files gives a straightforward
- * indication of which bits are on or off:
+ * indication of which bits are allowed (allow) or denied (off/deny).
- *      ENTRY off
+ *      ENTRY deny
- *      EXIT off
+ *      EXIT deny
 *      TCP off
 *      MSG off
 *      SOCKET off
- *      ERROR off
+ *      ERROR allow
- *      NOTICE on
+ *      NOTICE allow
 *
 * Writing changes the state of a given bit and requires a strictly formatted
 * single write() call:
 *
- *      write(fd, "ENTRY on", 8);
+ *      write(fd, "allow", 5);
 *
- * would turn the entry bit on.  "1" is also accepted in the place of "on", and
+ * Echoing allow/deny/off string into the logmask files can flip the bits
- * "off" and "0" behave as expected.
+ * on or off as expected; here is the bash script for example:
 *
- * Some trivial shell can flip all the bits on or off:
+ * log_mask="/sys/fs/o2cb/logmask"
+ * for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do
+ *      echo allow >"$log_mask"/"$node"
+ * done
 *
- * log_mask="/proc/fs/ocfs2_nodemanager/log_mask"
+ * The debugfs.ocfs2 tool can also flip the bits with the -l option:
- * cat $log_mask | (
+ *
- *      while read bit status; do
+ * debugfs.ocfs2 -l TCP allow
- *              # $1 is "on" or "off", say
- *              echo "$bit $1" > $log_mask
- *      done
- * )
 */
 /* for task_struct */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 9fbe849f6344..334f231a422c 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -974,7 +974,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
 int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
                           size_t caller_veclen, u8 target_node, int *status)
 {
-        int ret, error = 0;
+        int ret;
        struct o2net_msg *msg = NULL;
        size_t veclen, caller_bytes = 0;
        struct kvec *vec = NULL;
@@ -1015,10 +1015,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
        o2net_set_nst_sock_time(&nst);
-        ret = wait_event_interruptible(nn->nn_sc_wq,
+        wait_event(nn->nn_sc_wq, o2net_tx_can_proceed(nn, &sc, &ret));
-                                       o2net_tx_can_proceed(nn, &sc, &error));
-        if (!ret && error)
-                ret = error;
        if (ret)
                goto out;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c5752305627c..b358f3bf896d 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2900,6 +2900,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
        alloc = ocfs2_clusters_for_bytes(sb, bytes);
        dx_alloc = 0;
+        down_write(&oi->ip_alloc_sem);
        if (ocfs2_supports_indexed_dirs(osb)) {
                credits += ocfs2_add_dir_index_credits(sb);
@@ -2940,8 +2942,6 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
                goto out;
        }
-        down_write(&oi->ip_alloc_sem);
        /*
         * Prepare for worst case allocation scenario of two separate
         * extents in the unindexed tree.
@@ -2953,7 +2953,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                mlog_errno(ret);
-                goto out_sem;
+                goto out;
        }
        if (vfs_dq_alloc_space_nodirty(dir,
@@ -3172,10 +3172,8 @@ out_commit:
        ocfs2_commit_trans(osb, handle);
-out_sem:
-        up_write(&oi->ip_alloc_sem);
 out:
+        up_write(&oi->ip_alloc_sem);
        if (data_ac)
                ocfs2_free_alloc_context(data_ac);
        if (meta_ac)
@@ -3322,11 +3320,15 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
                brelse(new_bh);
                new_bh = NULL;
+                down_write(&OCFS2_I(dir)->ip_alloc_sem);
+                drop_alloc_sem = 1;
                dir_i_size = i_size_read(dir);
                credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
                goto do_extend;
        }
+        down_write(&OCFS2_I(dir)->ip_alloc_sem);
+        drop_alloc_sem = 1;
        dir_i_size = i_size_read(dir);
        mlog(0, "extending dir %llu (i_size = %lld)\n",
             (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size);
@@ -3370,9 +3372,6 @@ do_extend:
                credits++; /* For attaching the new dirent block to the
                            * dx_root */
-        down_write(&OCFS2_I(dir)->ip_alloc_sem);
-        drop_alloc_sem = 1;
        handle = ocfs2_start_trans(osb, credits);
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
@@ -3435,10 +3434,10 @@ bail_bh:
        *new_de_bh = new_bh;
        get_bh(*new_de_bh);
 bail:
-        if (drop_alloc_sem)
-                up_write(&OCFS2_I(dir)->ip_alloc_sem);
        if (handle)
                ocfs2_commit_trans(osb, handle);
+        if (drop_alloc_sem)
+                up_write(&OCFS2_I(dir)->ip_alloc_sem);
        if (data_ac)
                ocfs2_free_alloc_context(data_ac);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e15fc7d50827..110bb57c46ab 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -92,6 +92,9 @@ struct ocfs2_unblock_ctl {
        enum ocfs2_unblock_action unblock_action;
 };
+/*
+ * Lockdep class keys, one slot per lock type: ocfs2_lock_res_init_common()
+ * indexes this array by the lock resource's type.
+ *
+ * NOTE(review): not declared static -- confirm whether anything outside
+ * this file needs the symbol.
+ */
+struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
 static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
                                        int new_level);
 static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres);
@@ -248,6 +251,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
        .flags          = 0,
 };
+/*
+ * Lock resource operations for the orphan scan lock.  Only .flags is set:
+ * the lock requires a refresh and uses the lock value block (LVB).
+ */
+static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
+        .flags          = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
+};
 static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
        .get_osb        = ocfs2_get_dentry_osb,
        .post_unlock    = ocfs2_dentry_post_unlock,
@@ -313,9 +320,16 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
                             u32 dlm_flags);
 static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
                                                     int wanted);
-static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
+static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
-                                 struct ocfs2_lock_res *lockres,
+                                   struct ocfs2_lock_res *lockres,
-                                 int level);
+                                   int level, unsigned long caller_ip);
+/*
+ * Wrapper around __ocfs2_cluster_unlock() that fills in the caller_ip
+ * argument with _RET_IP_, so existing callers keep the three-argument form.
+ */
+static inline void ocfs2_cluster_unlock(struct ocfs2_super *osb,
+                                        struct ocfs2_lock_res *lockres,
+                                        int level)
+{
+        __ocfs2_cluster_unlock(osb, lockres, level, _RET_IP_);
+}
 static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
 static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
 static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
@@ -485,6 +499,13 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
        ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
        ocfs2_init_lock_stats(res);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+        if (type != OCFS2_LOCK_TYPE_OPEN)
+                lockdep_init_map(&res->l_lockdep_map, ocfs2_lock_type_strings[type],
+                                 &lockdep_keys[type], 0);
+        else
+                res->l_lockdep_map.key = NULL;
+#endif
 }
 void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
@@ -637,6 +658,15 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
                                   &ocfs2_nfs_sync_lops, osb);
 }
+/*
+ * Initialize the orphan scan lock resource @res for @osb: reset it with
+ * ocfs2_lock_res_init_once(), build its name for
+ * OCFS2_LOCK_TYPE_ORPHAN_SCAN, then run the common initialization with
+ * ocfs2_orphan_scan_lops and @osb as the private data.
+ */
+static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
+                                            struct ocfs2_super *osb)
+{
+        ocfs2_lock_res_init_once(res);
+        ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
+        ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
+                                   &ocfs2_orphan_scan_lops, osb);
+}
 void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
                              struct ocfs2_file_private *fp)
 {
@@ -1239,11 +1269,13 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
        return ret;
 }
-static int ocfs2_cluster_lock(struct ocfs2_super *osb,
+static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
-                              struct ocfs2_lock_res *lockres,
+                                struct ocfs2_lock_res *lockres,
-                              int level,
+                                int level,
-                              u32 lkm_flags,
+                                u32 lkm_flags,
-                              int arg_flags)
+                                int arg_flags,
+                                int l_subclass,
+                                unsigned long caller_ip)
 {
        struct ocfs2_mask_waiter mw;
        int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
@@ -1386,13 +1418,37 @@ out:
        }
        ocfs2_update_lock_stats(lockres, level, &mw, ret);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+        if (!ret && lockres->l_lockdep_map.key != NULL) {
+                if (level == DLM_LOCK_PR)
+                        rwsem_acquire_read(&lockres->l_lockdep_map, l_subclass,
+                                !!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
+                                caller_ip);
+                else
+                        rwsem_acquire(&lockres->l_lockdep_map, l_subclass,
+                                !!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
+                                caller_ip);
+        }
+#endif
        mlog_exit(ret);
        return ret;
 }
-static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
+static inline int ocfs2_cluster_lock(struct ocfs2_super *osb,
-                                 struct ocfs2_lock_res *lockres,
+                                     struct ocfs2_lock_res *lockres,
-                                 int level)
+                                     int level,
+                                     u32 lkm_flags,
+                                     int arg_flags)
+{
+        return __ocfs2_cluster_lock(osb, lockres, level, lkm_flags, arg_flags,
+                                    0, _RET_IP_);
+}
+static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
+                                   struct ocfs2_lock_res *lockres,
+                                   int level,
+                                   unsigned long caller_ip)
 {
        unsigned long flags;
@@ -1401,6 +1457,10 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
        ocfs2_dec_holders(lockres, level);
        ocfs2_downconvert_on_unlock(osb, lockres);
        spin_unlock_irqrestore(&lockres->l_lock, flags);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+        if (lockres->l_lockdep_map.key != NULL)
+                rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
+#endif
        mlog_exit_void();
 }
@@ -1972,7 +2032,8 @@ static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
 {
        struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
-        if (lvb->lvb_version == OCFS2_LVB_VERSION
+        if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)
+            && lvb->lvb_version == OCFS2_LVB_VERSION
            && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
                return 1;
        return 0;
@@ -2145,10 +2206,11 @@ static int ocfs2_assign_bh(struct inode *inode,
 * returns < 0 error if the callback will never be called, otherwise
 * the result of the lock will be communicated via the callback.
 */
-int ocfs2_inode_lock_full(struct inode *inode,
+int ocfs2_inode_lock_full_nested(struct inode *inode,
-                         struct buffer_head **ret_bh,
+                                 struct buffer_head **ret_bh,
-                         int ex,
+                                 int ex,
-                         int arg_flags)
+                                 int arg_flags,
+                                 int subclass)
 {
        int status, level, acquired;
        u32 dlm_flags;
@@ -2186,7 +2248,8 @@ int ocfs2_inode_lock_full(struct inode *inode,
        if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
                dlm_flags |= DLM_LKF_NOQUEUE;
-        status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
+        status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
+                                      arg_flags, subclass, _RET_IP_);
        if (status < 0) {
                if (status != -EAGAIN && status != -EIOCBRETRY)
                        mlog_errno(status);
@@ -2352,6 +2415,47 @@ void ocfs2_inode_unlock(struct inode *inode,
        mlog_exit_void();
 }
+/*
+ * Take the orphan scan cluster lock at DLM_LOCK_EX and report the scan
+ * sequence number through @seqno.
+ *
+ * Returns -EROFS if the volume is hard read-only.  Returns 0 without
+ * taking any lock, and without writing *seqno, on a local mount.
+ * Otherwise returns the negative status of ocfs2_cluster_lock() on
+ * failure, or its non-negative status on success.
+ *
+ * On success *seqno is the value stored in the LVB when the LVB is valid
+ * and carries OCFS2_ORPHAN_LVB_VERSION; otherwise it is this node's
+ * os_seqno + 1.
+ */
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
+{
+        struct ocfs2_lock_res *lockres;
+        struct ocfs2_orphan_scan_lvb *lvb;
+        int status = 0;
+        if (ocfs2_is_hard_readonly(osb))
+                return -EROFS;
+        if (ocfs2_mount_local(osb))
+                return 0;
+        lockres = &osb->osb_orphan_scan.os_lockres;
+        status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
+        if (status < 0)
+                return status;
+        lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+        if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
+            lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION)
+                *seqno = be32_to_cpu(lvb->lvb_os_seqno);
+        else
+                /* No usable LVB: hand back a value differing from os_seqno */
+                *seqno = osb->osb_orphan_scan.os_seqno + 1;
+        return status;
+}
+/*
+ * Release the orphan scan cluster lock taken at DLM_LOCK_EX.
+ *
+ * Does nothing when the volume is hard read-only or mounted locally.
+ * Otherwise the LVB is stamped with OCFS2_ORPHAN_LVB_VERSION and @seqno
+ * (stored big-endian) before the lock is dropped.
+ */
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno)
+{
+        struct ocfs2_lock_res *lockres;
+        struct ocfs2_orphan_scan_lvb *lvb;
+        if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) {
+                lockres = &osb->osb_orphan_scan.os_lockres;
+                lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+                lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
+                lvb->lvb_os_seqno = cpu_to_be32(seqno);
+                ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
+        }
+}
 int ocfs2_super_lock(struct ocfs2_super *osb,
                     int ex)
 {
@@ -2842,6 +2946,7 @@ local:
        ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
        ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
        ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+        ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
        osb->cconn = conn;
@@ -2878,6 +2983,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
        ocfs2_lock_res_free(&osb->osb_super_lockres);
        ocfs2_lock_res_free(&osb->osb_rename_lockres);
        ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
+        ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
        ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
        osb->cconn = NULL;
@@ -3061,6 +3167,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
        ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
        ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
        ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
+        ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
 }
 int ocfs2_drop_inode_locks(struct inode *inode)
@@ -3576,7 +3683,8 @@ static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
        struct ocfs2_global_disk_dqinfo *gdinfo;
        int status = 0;
-        if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
+        if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
+            lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
                info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
                info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
                oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index e1fd5721cd7f..7553836931de 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -62,6 +62,14 @@ struct ocfs2_qinfo_lvb {
        __be32  lvb_free_entry;
 };
+/* Version stamped into, and expected in, ocfs2_orphan_scan_lvb.lvb_version */
+#define OCFS2_ORPHAN_LVB_VERSION 1
+/* Layout of the lock value block (LVB) used with the orphan scan lock */
+struct ocfs2_orphan_scan_lvb {
+        __u8    lvb_version;            /* OCFS2_ORPHAN_LVB_VERSION */
+        __u8    lvb_reserved[3];
+        __be32  lvb_os_seqno;           /* Orphan scan sequence number */
+};
 /* ocfs2_inode_lock_full() 'arg_flags' flags */
 /* don't wait on recovery. */
 #define OCFS2_META_LOCK_RECOVERY        (0x01)
@@ -70,6 +78,14 @@ struct ocfs2_qinfo_lvb {
 /* don't block waiting for the downconvert thread, instead return -EAGAIN */
 #define OCFS2_LOCK_NONBLOCK             (0x04)
+/*
+ * Locking subclasses of inode cluster lock.  One of these is passed as the
+ * 'subclass' argument of ocfs2_inode_lock_full_nested(); OI_LS_NORMAL (0)
+ * is what the non-nested lock variants use.
+ */
+enum {
+        OI_LS_NORMAL = 0,
+        OI_LS_PARENT,
+        OI_LS_RENAME1,
+        OI_LS_RENAME2,
+};
 int ocfs2_dlm_init(struct ocfs2_super *osb);
 void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending);
 void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
@@ -96,23 +112,32 @@ void ocfs2_open_unlock(struct inode *inode);
 int ocfs2_inode_lock_atime(struct inode *inode,
                          struct vfsmount *vfsmnt,
                          int *level);
-int ocfs2_inode_lock_full(struct inode *inode,
+int ocfs2_inode_lock_full_nested(struct inode *inode,
                         struct buffer_head **ret_bh,
                         int ex,
-                         int arg_flags);
+                         int arg_flags,
+                         int subclass);
 int ocfs2_inode_lock_with_page(struct inode *inode,
                              struct buffer_head **ret_bh,
                              int ex,
                              struct page *page);
+/*
+ * Variants without special locking class or flags:
+ *  - ocfs2_inode_lock_full() keeps the caller's arg_flags and uses the
+ *    OI_LS_NORMAL subclass;
+ *  - ocfs2_inode_lock_nested() keeps the caller's subclass and passes 0
+ *    for arg_flags.
+ */
+#define ocfs2_inode_lock_full(i, r, e, f)\
+                ocfs2_inode_lock_full_nested(i, r, e, f, OI_LS_NORMAL)
+#define ocfs2_inode_lock_nested(i, b, e, s)\
+                ocfs2_inode_lock_full_nested(i, b, e, 0, s)
 /* 99% of the time we don't want to supply any additional flags --
 * those are for very specific cases only. */
-#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full(i, b, e, 0)
+#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL)
 void ocfs2_inode_unlock(struct inode *inode,
                       int ex);
 int ocfs2_super_lock(struct ocfs2_super *osb,
                     int ex);
 void ocfs2_super_unlock(struct ocfs2_super *osb,
                        int ex);
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno);
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno);
 int ocfs2_rename_lock(struct ocfs2_super *osb);
 void ocfs2_rename_unlock(struct ocfs2_super *osb);
 int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c2a87c885b73..62442e413a00 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -187,6 +187,9 @@ static int ocfs2_sync_file(struct file *file,
        if (err)
                goto bail;
+        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+                goto bail;
        journal = osb->journal->j_journal;
        err = jbd2_journal_force_commit(journal);
@@ -894,9 +897,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
        struct ocfs2_super *osb = OCFS2_SB(sb);
        struct buffer_head *bh = NULL;
        handle_t *handle = NULL;
-        int locked[MAXQUOTAS] = {0, 0};
+        int qtype;
-        int credits, qtype;
+        struct dquot *transfer_from[MAXQUOTAS] = { };
-        struct ocfs2_mem_dqinfo *oinfo;
+        struct dquot *transfer_to[MAXQUOTAS] = { };
        mlog_entry("(0x%p, '%.*s')\n", dentry,
                   dentry->d_name.len, dentry->d_name.name);
@@ -969,30 +972,37 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
        if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
            (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
-                credits = OCFS2_INODE_UPDATE_CREDITS;
+                /*
+                 * Gather pointers to quota structures so that allocation /
+                 * freeing of quota structures happens here and not inside
+                 * vfs_dq_transfer() where we have problems with lock ordering
+                 */
                if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
                    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
                    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
-                        oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv;
+                        transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
-                        status = ocfs2_lock_global_qf(oinfo, 1);
+                                                      USRQUOTA);
-                        if (status < 0)
+                        transfer_from[USRQUOTA] = dqget(sb, inode->i_uid,
+                                                        USRQUOTA);
+                        if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) {
+                                status = -ESRCH;
                                goto bail_unlock;
-                        credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) +
+                        }
-                                ocfs2_calc_qdel_credits(sb, USRQUOTA);
-                        locked[USRQUOTA] = 1;
                }
                if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
                    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
                    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
-                        oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv;
+                        transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
-                        status = ocfs2_lock_global_qf(oinfo, 1);
+                                                      GRPQUOTA);
-                        if (status < 0)
+                        transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid,
+                                                        GRPQUOTA);
+                        if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) {
+                                status = -ESRCH;
                                goto bail_unlock;
-                        credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) +
+                        }
-                                   ocfs2_calc_qdel_credits(sb, GRPQUOTA);
-                        locked[GRPQUOTA] = 1;
                }
-                handle = ocfs2_start_trans(osb, credits);
+                handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
+                                           2 * ocfs2_quota_trans_credits(sb));
                if (IS_ERR(handle)) {
                        status = PTR_ERR(handle);
                        mlog_errno(status);
@@ -1030,12 +1040,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 bail_commit:
        ocfs2_commit_trans(osb, handle);
 bail_unlock:
-        for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
-                if (!locked[qtype])
-                        continue;
-                oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
-                ocfs2_unlock_global_qf(oinfo, 1);
-        }
        ocfs2_inode_unlock(inode, 1);
 bail_unlock_rw:
        if (size_change)
@@ -1043,6 +1047,12 @@ bail_unlock_rw:
 bail:
        brelse(bh);
+        /* Release quota pointers in case we acquired them */
+        for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
+                dqput(transfer_to[qtype]);
+                dqput(transfer_from[qtype]);
+        }
        if (!status && attr->ia_valid & ATTR_MODE) {
                status = ocfs2_acl_chmod(inode);
                if (status < 0)
@@ -2016,7 +2026,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
                                      size_t len,
                                      unsigned int flags)
 {
-        int ret = 0;
+        int ret = 0, lock_level = 0;
        struct inode *inode = in->f_path.dentry->d_inode;
        mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
@@ -2027,12 +2037,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
        /*
         * See the comment in ocfs2_file_aio_read()
         */
-        ret = ocfs2_inode_lock(inode, NULL, 0);
+        ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level);
        if (ret < 0) {
                mlog_errno(ret);
                goto bail;
        }
-        ocfs2_inode_unlock(inode, 0);
+        ocfs2_inode_unlock(inode, lock_level);
        ret = generic_file_splice_read(in, ppos, pipe, len, flags);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 10e1fa87396a..4dc8890ba316 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -215,6 +215,8 @@ bail:
 static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
 {
        struct ocfs2_find_inode_args *args = opaque;
+        static struct lock_class_key ocfs2_quota_ip_alloc_sem_key,
+                                     ocfs2_file_ip_alloc_sem_key;
        mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
@@ -223,6 +225,15 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
        if (args->fi_sysfile_type != 0)
                lockdep_set_class(&inode->i_mutex,
                        &ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
+        if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE ||
+            args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE ||
+            args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE ||
+            args->fi_sysfile_type == LOCAL_GROUP_QUOTA_SYSTEM_INODE)
+                lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
+                                  &ocfs2_quota_ip_alloc_sem_key);
+        else
+                lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
+                                  &ocfs2_file_ip_alloc_sem_key);
        mlog_exit(0);
        return 0;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 9fcd36dcc9a0..467b413bec21 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,7 +7,6 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
-#include <linux/smp_lock.h>
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index a20a0f1e37fd..f033760ecbea 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -28,6 +28,8 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/kthread.h>
+#include <linux/time.h>
+#include <linux/random.h>
 #define MLOG_MASK_PREFIX ML_JOURNAL
 #include <cluster/masklog.h>
@@ -52,6 +54,8 @@
 DEFINE_SPINLOCK(trans_inc_lock);
+/* Base orphan scan interval, in milliseconds (fed to msecs_to_jiffies()) */
+#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000
 static int ocfs2_force_read_journal(struct inode *inode);
 static int ocfs2_recover_node(struct ocfs2_super *osb,
                              int node_num, int slot_num);
@@ -1841,6 +1845,128 @@ bail:
        return status;
 }
+/*
+ * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some
+ * randomness to the timeout to minimize multiple nodes firing the timer at the
+ * same time.
+ */
+static inline unsigned long ocfs2_orphan_scan_timeout(void)
+{
+        unsigned long time;
+        get_random_bytes(&time, sizeof(time));
+        time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000);
+        return msecs_to_jiffies(time);
+}
+/*
+ * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for
+ * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This
+ * is done to catch any orphans that are left over in orphan directories.
+ *
+ * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
+ * milliseconds.  It gets an EX lock on os_lockres and checks sequence number
+ * stored in LVB. If the sequence number has changed, it means some other
+ * node has done the scan.  This node skips the scan and tracks the
+ * sequence number.  If the sequence number didn't change, it means a scan
+ * hasn't happened.  The node queues a scan and increments the
+ * sequence number in the LVB.
+ */
+void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
+{
+        struct ocfs2_orphan_scan *os;
+        int status, i;
+        u32 seqno = 0;
+        os = &osb->osb_orphan_scan;
+        if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
+                goto out;
+        status = ocfs2_orphan_scan_lock(osb, &seqno);
+        if (status < 0) {
+                if (status != -EAGAIN)
+                        mlog_errno(status);
+                goto out;
+        }
+        /* Do not queue the tasks if the volume is being umounted */
+        if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
+                goto unlock;
+        if (os->os_seqno != seqno) {
+                os->os_seqno = seqno;
+                goto unlock;
+        }
+        for (i = 0; i < osb->max_slots; i++)
+                ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
+                                                NULL);
+        /*
+         * We queued a recovery on orphan slots, increment the sequence
+         * number and update LVB so other nodes will skip the scan for a while
+         */
+        seqno++;
+        os->os_count++;
+        os->os_scantime = CURRENT_TIME;
+unlock:
+        ocfs2_orphan_scan_unlock(osb, seqno);
+out:
+        return;
+}
+/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT msecs */
+void ocfs2_orphan_scan_work(struct work_struct *work)
+{
+        struct ocfs2_orphan_scan *os;
+        struct ocfs2_super *osb;
+        os = container_of(work, struct ocfs2_orphan_scan,
+                          os_orphan_scan_work.work);
+        osb = os->os_osb;
+        mutex_lock(&os->os_lock);
+        ocfs2_queue_orphan_scan(osb);
+        if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
+                schedule_delayed_work(&os->os_orphan_scan_work,
+                                      ocfs2_orphan_scan_timeout());
+        mutex_unlock(&os->os_lock);
+}
+void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
+{
+        struct ocfs2_orphan_scan *os;
+        os = &osb->osb_orphan_scan;
+        if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) {
+                atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
+                mutex_lock(&os->os_lock);
+                cancel_delayed_work(&os->os_orphan_scan_work);
+                mutex_unlock(&os->os_lock);
+        }
+}
+void ocfs2_orphan_scan_init(struct ocfs2_super *osb)
+{
+        struct ocfs2_orphan_scan *os;
+        os = &osb->osb_orphan_scan;
+        os->os_osb = osb;
+        os->os_count = 0;
+        os->os_seqno = 0;
+        os->os_scantime = CURRENT_TIME;
+        mutex_init(&os->os_lock);
+        INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work);
+        if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
+                atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
+        else {
+                atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
+                schedule_delayed_work(&os->os_orphan_scan_work,
+                                      ocfs2_orphan_scan_timeout());
+        }
+}
 struct ocfs2_orphan_filldir_priv {
        struct inode            *head;
        struct ocfs2_super      *osb;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index eb7b76331eb7..5432c7f79cc6 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -144,6 +144,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
 }
 /* Exported only for the journal struct init code in super.c. Do not call. */
+void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
 void ocfs2_complete_recovery(struct work_struct *work);
 void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 33464c6b60a2..8601f934010b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -118,7 +118,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
        mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
             dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
-        status = ocfs2_inode_lock(dir, NULL, 0);
+        status = ocfs2_inode_lock_nested(dir, NULL, 0, OI_LS_PARENT);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -636,7 +636,7 @@ static int ocfs2_link(struct dentry *old_dentry,
        if (S_ISDIR(inode->i_mode))
                return -EPERM;
-        err = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
+        err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
        if (err < 0) {
                if (err != -ENOENT)
                        mlog_errno(err);
@@ -800,7 +800,8 @@ static int ocfs2_unlink(struct inode *dir,
                return -EPERM;
        }
-        status = ocfs2_inode_lock(dir, &parent_node_bh, 1);
+        status = ocfs2_inode_lock_nested(dir, &parent_node_bh, 1,
+                                         OI_LS_PARENT);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -978,7 +979,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
                        inode1 = tmpinode;
                }
                /* lock id2 */
-                status = ocfs2_inode_lock(inode2, bh2, 1);
+                status = ocfs2_inode_lock_nested(inode2, bh2, 1,
+                                                 OI_LS_RENAME1);
                if (status < 0) {
                        if (status != -ENOENT)
                                mlog_errno(status);
@@ -987,7 +989,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
        }
        /* lock id1 */
-        status = ocfs2_inode_lock(inode1, bh1, 1);
+        status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2);
        if (status < 0) {
                /*
                 * An error return must mean that no cluster locks
@@ -1103,7 +1105,8 @@ static int ocfs2_rename(struct inode *old_dir,
         * won't have to concurrently downconvert the inode and the
         * dentry locks.
         */
-        status = ocfs2_inode_lock(old_inode, &old_inode_bh, 1);
+        status = ocfs2_inode_lock_nested(old_inode, &old_inode_bh, 1,
+                                         OI_LS_PARENT);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1386281950db..c9345ebb8493 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -34,6 +34,7 @@
 #include <linux/workqueue.h>
 #include <linux/kref.h>
 #include <linux/mutex.h>
+#include <linux/lockdep.h>
 #ifndef CONFIG_OCFS2_COMPAT_JBD
 # include <linux/jbd2.h>
 #else
@@ -47,6 +48,9 @@
 #include "ocfs2_fs.h"
 #include "ocfs2_lockid.h"
+/* For struct ocfs2_blockcheck_stats */
+#include "blockcheck.h"
 /* Most user visible OCFS2 inodes will have very few pieces of
 * metadata, but larger files (including bitmaps, etc) must be taken
 * into account when designing an access scheme. We allow a small
@@ -149,6 +153,25 @@ struct ocfs2_lock_res {
        unsigned int             l_lock_max_exmode;        /* Max wait for EX */
        unsigned int             l_lock_refresh;           /* Disk refreshes */
 #endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+        struct lockdep_map       l_lockdep_map;
+#endif
+};
+enum ocfs2_orphan_scan_state {
+        ORPHAN_SCAN_ACTIVE,
+        ORPHAN_SCAN_INACTIVE
+};
+struct ocfs2_orphan_scan {
+        struct mutex            os_lock;
+        struct ocfs2_super      *os_osb;
+        struct ocfs2_lock_res   os_lockres;     /* lock to synchronize scans */
+        struct delayed_work     os_orphan_scan_work;
+        struct timespec         os_scantime;  /* time this node ran the scan */
+        u32                     os_count;      /* tracks node specific scans */
+        u32                     os_seqno;       /* tracks cluster wide scans */
+        atomic_t                os_state;              /* ACTIVE or INACTIVE */
 };
 struct ocfs2_dlm_debug {
@@ -295,6 +318,7 @@ struct ocfs2_super
        struct ocfs2_dinode *local_alloc_copy;
        struct ocfs2_quota_recovery *quota_rec;
+        struct ocfs2_blockcheck_stats osb_ecc_stats;
        struct ocfs2_alloc_stats alloc_stats;
        char dev_str[20];               /* "major,minor" of the device */
@@ -341,6 +365,8 @@ struct ocfs2_super
        unsigned int                    *osb_orphan_wipes;
        wait_queue_head_t               osb_wipe_event;
+        struct ocfs2_orphan_scan        osb_orphan_scan;
        /* used to protect metaecc calculation check of xattr. */
        spinlock_t osb_xattr_lock;
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index a53ce87481bf..fcdba091af3d 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -48,6 +48,7 @@ enum ocfs2_lock_type {
        OCFS2_LOCK_TYPE_FLOCK,
        OCFS2_LOCK_TYPE_QINFO,
        OCFS2_LOCK_TYPE_NFS_SYNC,
+        OCFS2_LOCK_TYPE_ORPHAN_SCAN,
        OCFS2_NUM_LOCK_TYPES
 };
@@ -85,6 +86,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
                case OCFS2_LOCK_TYPE_NFS_SYNC:
                        c = 'Y';
                        break;
+                case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
+                        c = 'P';
+                        break;
                default:
                        c = '\0';
        }
@@ -104,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = {
        [OCFS2_LOCK_TYPE_OPEN] = "Open",
        [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
        [OCFS2_LOCK_TYPE_QINFO] = "Quota",
+        [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
 };
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 1ed0f7c86869..edfa60cd155c 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -421,6 +421,7 @@ int ocfs2_global_read_dquot(struct dquot *dquot)
        OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
        if (!dquot->dq_off) {   /* No real quota entry? */
                /* Upgrade to exclusive lock for allocation */
+                ocfs2_qinfo_unlock(info, 0);
                err = ocfs2_qinfo_lock(info, 1);
                if (err < 0)
                        goto out_qlock;
@@ -435,7 +436,8 @@ int ocfs2_global_read_dquot(struct dquot *dquot)
 out_qlock:
        if (ex)
                ocfs2_qinfo_unlock(info, 1);
-        ocfs2_qinfo_unlock(info, 0);
+        else
+                ocfs2_qinfo_unlock(info, 0);
 out:
        if (err < 0)
                mlog_errno(err);
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 07deec5e9721..5a460fa82553 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -444,10 +444,6 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
        mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
-        status = ocfs2_lock_global_qf(oinfo, 1);
-        if (status < 0)
-                goto out;
        list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
                chunk = rchunk->rc_chunk;
                hbh = NULL;
@@ -480,12 +476,18 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
                                     type);
                                goto out_put_bh;
                        }
+                        status = ocfs2_lock_global_qf(oinfo, 1);
+                        if (status < 0) {
+                                mlog_errno(status);
+                                goto out_put_dquot;
+                        }
                        handle = ocfs2_start_trans(OCFS2_SB(sb),
                                                   OCFS2_QSYNC_CREDITS);
                        if (IS_ERR(handle)) {
                                status = PTR_ERR(handle);
                                mlog_errno(status);
-                                goto out_put_dquot;
+                                goto out_drop_lock;
                        }
                        mutex_lock(&sb_dqopt(sb)->dqio_mutex);
                        spin_lock(&dq_data_lock);
@@ -523,6 +525,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
 out_commit:
                        mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
                        ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_drop_lock:
+                        ocfs2_unlock_global_qf(oinfo, 1);
 out_put_dquot:
                        dqput(dquot);
 out_put_bh:
@@ -537,8 +541,6 @@ out_put_bh:
                if (status < 0)
                        break;
        }
-        ocfs2_unlock_global_qf(oinfo, 1);
-out:
        if (status < 0)
                free_recovery_list(&(rec->r_list[type]));
        mlog_exit(status);
@@ -655,6 +657,9 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
        struct ocfs2_quota_recovery *rec;
        int locked = 0;
+        /* We don't need the lock and we have to acquire quota file locks
+         * which will later depend on this lock */
+        mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
        info->dqi_maxblimit = 0x7fffffffffffffffLL;
        info->dqi_maxilimit = 0x7fffffffffffffffLL;
        oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
@@ -733,6 +738,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
                goto out_err;
        }
+        mutex_lock(&sb_dqopt(sb)->dqio_mutex);
        return 0;
 out_err:
        if (oinfo) {
@@ -746,6 +752,7 @@ out_err:
                kfree(oinfo);
        }
        brelse(bh);
+        mutex_lock(&sb_dqopt(sb)->dqio_mutex);
        return -1;
 }
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index fcd120f1493a..3f661376a2de 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -236,6 +236,16 @@ static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
        return dlm_status_to_errno(lksb->lksb_o2dlm.status);
 }
+/*
+ * o2dlm always has a "valid" LVB. If the dlm loses track of the LVB
+ * contents, it will zero out the LVB.  Thus the caller can always trust
+ * the contents.
+ */
+static int o2cb_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb)
+{
+        return 1;
+}
 static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb)
 {
        return (void *)(lksb->lksb_o2dlm.lvb);
@@ -354,6 +364,7 @@ static struct ocfs2_stack_operations o2cb_stack_ops = {
        .dlm_lock       = o2cb_dlm_lock,
        .dlm_unlock     = o2cb_dlm_unlock,
        .lock_status    = o2cb_dlm_lock_status,
+        .lvb_valid      = o2cb_dlm_lvb_valid,
        .lock_lvb       = o2cb_dlm_lvb,
        .dump_lksb      = o2cb_dump_lksb,
 };
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 9b76d41a8ac6..ff4c798a5635 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -738,6 +738,13 @@ static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
        return lksb->lksb_fsdlm.sb_status;
 }
+static int user_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb)
+{
+        int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID;
+        return !invalid;
+}
 static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
 {
        if (!lksb->lksb_fsdlm.sb_lvbptr)
@@ -873,6 +880,7 @@ static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
        .dlm_lock       = user_dlm_lock,
        .dlm_unlock     = user_dlm_unlock,
        .lock_status    = user_dlm_lock_status,
+        .lvb_valid      = user_dlm_lvb_valid,
        .lock_lvb       = user_dlm_lvb,
        .plock          = user_plock,
        .dump_lksb      = user_dlm_dump_lksb,
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 68b668b0e60a..3f2f1c45b7b6 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -6,7 +6,7 @@
 * Code which implements an OCFS2 specific interface to underlying
 * cluster stacks.
 *
- * Copyright (C) 2007 Oracle.  All rights reserved.
+ * Copyright (C) 2007, 2009 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
@@ -271,11 +271,12 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
 }
 EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
-/*
+int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb)
- * Why don't we cast to ocfs2_meta_lvb?  The "clean" answer is that we
+{
- * don't cast at the glue level.  The real answer is that the header
+        return active_stack->sp_ops->lvb_valid(lksb);
- * ordering is nigh impossible.
+}
- */
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid);
 void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
 {
        return active_stack->sp_ops->lock_lvb(lksb);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index c571af375ef8..03a44d60eac9 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -186,6 +186,11 @@ struct ocfs2_stack_operations {
        int (*lock_status)(union ocfs2_dlm_lksb *lksb);
        /*
+         * Return non-zero if the LVB is valid.
+         */
+        int (*lvb_valid)(union ocfs2_dlm_lksb *lksb);
+        /*
         * Pull the lvb pointer off of the stack-specific lksb.
         */
        void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
@@ -252,6 +257,7 @@ int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
                     struct ocfs2_lock_res *astarg);
 int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
+int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb);
 void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
 void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8439f6b324b9..73a16d4666dc 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -923,14 +923,23 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
                                         int nr)
 {
        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+        int ret;
        if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
                return 0;
-        if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
+        if (!buffer_jbd(bg_bh))
                return 1;
+        jbd_lock_bh_state(bg_bh);
        bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
-        return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+        if (bg)
+                ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+        else
+                ret = 1;
+        jbd_unlock_bh_state(bg_bh);
+        return ret;
 }
 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
@@ -1885,6 +1894,7 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
        unsigned int tmp;
        int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
        struct ocfs2_group_desc *undo_bg = NULL;
+        int cluster_bitmap = 0;
        mlog_entry_void();
@@ -1905,18 +1915,28 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
        }
        if (ocfs2_is_cluster_bitmap(alloc_inode))
-                undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
+                cluster_bitmap = 1;
+        if (cluster_bitmap) {
+                jbd_lock_bh_state(group_bh);
+                undo_bg = (struct ocfs2_group_desc *)
+                                        bh2jh(group_bh)->b_committed_data;
+                BUG_ON(!undo_bg);
+        }
        tmp = num_bits;
        while(tmp--) {
                ocfs2_clear_bit((bit_off + tmp),
                                (unsigned long *) bg->bg_bitmap);
-                if (ocfs2_is_cluster_bitmap(alloc_inode))
+                if (cluster_bitmap)
                        ocfs2_set_bit(bit_off + tmp,
                                      (unsigned long *) undo_bg->bg_bitmap);
        }
        le16_add_cpu(&bg->bg_free_bits_count, num_bits);
+        if (cluster_bitmap)
+                jbd_unlock_bh_state(group_bh);
        status = ocfs2_journal_dirty(handle, group_bh);
        if (status < 0)
                mlog_errno(status);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 201b40a441fe..7efb349fb9bd 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -119,10 +119,12 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
 static int ocfs2_check_volume(struct ocfs2_super *osb);
 static int ocfs2_verify_volume(struct ocfs2_dinode *di,
                               struct buffer_head *bh,
-                               u32 sectsize);
+                               u32 sectsize,
+                               struct ocfs2_blockcheck_stats *stats);
 static int ocfs2_initialize_super(struct super_block *sb,
                                  struct buffer_head *bh,
-                                  int sector_size);
+                                  int sector_size,
+                                  struct ocfs2_blockcheck_stats *stats);
 static int ocfs2_get_sector(struct super_block *sb,
                            struct buffer_head **bh,
                            int block,
@@ -203,10 +205,10 @@ static const match_table_t tokens = {
 #ifdef CONFIG_DEBUG_FS
 static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
 {
-        int out = 0;
-        int i;
        struct ocfs2_cluster_connection *cconn = osb->cconn;
        struct ocfs2_recovery_map *rm = osb->recovery_map;
+        struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
+        int i, out = 0;
        out += snprintf(buf + out, len - out,
                        "%10s => Id: %-s  Uuid: %-s  Gen: 0x%X  Label: %-s\n",
@@ -231,20 +233,24 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
                        "%10s => Opts: 0x%lX  AtimeQuanta: %u\n", "Mount",
                        osb->s_mount_opt, osb->s_atime_quantum);
-        out += snprintf(buf + out, len - out,
+        if (cconn) {
-                        "%10s => Stack: %s  Name: %*s  Version: %d.%d\n",
+                out += snprintf(buf + out, len - out,
-                        "Cluster",
+                                "%10s => Stack: %s  Name: %*s  "
-                        (*osb->osb_cluster_stack == '\0' ?
+                                "Version: %d.%d\n", "Cluster",
-                         "o2cb" : osb->osb_cluster_stack),
+                                (*osb->osb_cluster_stack == '\0' ?
-                        cconn->cc_namelen, cconn->cc_name,
+                                 "o2cb" : osb->osb_cluster_stack),
-                        cconn->cc_version.pv_major, cconn->cc_version.pv_minor);
+                                cconn->cc_namelen, cconn->cc_name,
+                                cconn->cc_version.pv_major,
+                                cconn->cc_version.pv_minor);
+        }
        spin_lock(&osb->dc_task_lock);
        out += snprintf(buf + out, len - out,
                        "%10s => Pid: %d  Count: %lu  WakeSeq: %lu  "
                        "WorkSeq: %lu\n", "DownCnvt",
-                        task_pid_nr(osb->dc_task), osb->blocked_lock_count,
+                        (osb->dc_task ?  task_pid_nr(osb->dc_task) : -1),
-                        osb->dc_wake_sequence, osb->dc_work_sequence);
+                        osb->blocked_lock_count, osb->dc_wake_sequence,
+                        osb->dc_work_sequence);
        spin_unlock(&osb->dc_task_lock);
        spin_lock(&osb->osb_lock);
@@ -264,14 +270,15 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
        out += snprintf(buf + out, len - out,
                        "%10s => Pid: %d  Interval: %lu  Needs: %d\n", "Commit",
-                        task_pid_nr(osb->commit_task), osb->osb_commit_interval,
+                        (osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
+                        osb->osb_commit_interval,
                        atomic_read(&osb->needs_checkpoint));
        out += snprintf(buf + out, len - out,
-                        "%10s => State: %d  NumTxns: %d  TxnId: %lu\n",
+                        "%10s => State: %d  TxnId: %lu  NumTxns: %d\n",
                        "Journal", osb->journal->j_state,
-                        atomic_read(&osb->journal->j_num_trans),
+                        osb->journal->j_trans_id,
-                        osb->journal->j_trans_id);
+                        atomic_read(&osb->journal->j_num_trans));
        out += snprintf(buf + out, len - out,
                        "%10s => GlobalAllocs: %d  LocalAllocs: %d  "
@@ -297,9 +304,18 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
                        atomic_read(&osb->s_num_inodes_stolen));
        spin_unlock(&osb->osb_lock);
+        out += snprintf(buf + out, len - out, "OrphanScan => ");
+        out += snprintf(buf + out, len - out, "Local: %u  Global: %u ",
+                        os->os_count, os->os_seqno);
+        out += snprintf(buf + out, len - out, " Last Scan: ");
+        if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
+                out += snprintf(buf + out, len - out, "Disabled\n");
+        else
+                out += snprintf(buf + out, len - out, "%lu seconds ago\n",
+                                (get_seconds() - os->os_scantime.tv_sec));
        out += snprintf(buf + out, len - out, "%10s => %3s  %10s\n",
                        "Slots", "Num", "RecoGen");
        for (i = 0; i < osb->max_slots; ++i) {
                out += snprintf(buf + out, len - out,
                                "%10s  %c %3d  %10d\n",
@@ -542,7 +558,7 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
         */
 #if BITS_PER_LONG == 32
-# if defined(CONFIG_LBD)
+# if defined(CONFIG_LBDAF)
        BUILD_BUG_ON(sizeof(sector_t) != 8);
        /*
         * We might be limited by page cache size.
@@ -693,7 +709,8 @@ out:
 static int ocfs2_sb_probe(struct super_block *sb,
                          struct buffer_head **bh,
-                          int *sector_size)
+                          int *sector_size,
+                          struct ocfs2_blockcheck_stats *stats)
 {
        int status, tmpstat;
        struct ocfs1_vol_disk_hdr *hdr;
@@ -759,7 +776,8 @@ static int ocfs2_sb_probe(struct super_block *sb,
                        goto bail;
                }
                di = (struct ocfs2_dinode *) (*bh)->b_data;
-                status = ocfs2_verify_volume(di, *bh, blksize);
+                memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
+                status = ocfs2_verify_volume(di, *bh, blksize, stats);
                if (status >= 0)
                        goto bail;
                brelse(*bh);
@@ -965,6 +983,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        struct ocfs2_super *osb = NULL;
        struct buffer_head *bh = NULL;
        char nodestr[8];
+        struct ocfs2_blockcheck_stats stats;
        mlog_entry("%p, %p, %i", sb, data, silent);
@@ -974,13 +993,13 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        }
        /* probe for superblock */
-        status = ocfs2_sb_probe(sb, &bh, &sector_size);
+        status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats);
        if (status < 0) {
                mlog(ML_ERROR, "superblock probe failed!\n");
                goto read_super_error;
        }
-        status = ocfs2_initialize_super(sb, bh, sector_size);
+        status = ocfs2_initialize_super(sb, bh, sector_size, &stats);
        osb = OCFS2_SB(sb);
        if (status < 0) {
                mlog_errno(status);
@@ -1090,6 +1109,18 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
                goto read_super_error;
        }
+        if (ocfs2_meta_ecc(osb)) {
+                status = ocfs2_blockcheck_stats_debugfs_install(
+                                                &osb->osb_ecc_stats,
+                                                osb->osb_debug_root);
+                if (status) {
+                        mlog(ML_ERROR,
+                             "Unable to create blockcheck statistics "
+                             "files\n");
+                        goto read_super_error;
+                }
+        }
        status = ocfs2_mount_volume(sb);
        if (osb->root_inode)
                inode = igrab(osb->root_inode);
@@ -1150,6 +1181,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
        wake_up(&osb->osb_mount_event);
+        /* Start this when the mount is almost sure of being successful */
+        ocfs2_orphan_scan_init(osb);
        mlog_exit(status);
        return status;
@@ -1760,13 +1794,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
        }
        status = ocfs2_truncate_log_init(osb);
-        if (status < 0) {
+        if (status < 0)
                mlog_errno(status);
-                goto leave;
-        }
-        if (ocfs2_mount_local(osb))
-                goto leave;
 leave:
        if (unlock_super)
@@ -1790,6 +1819,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
        debugfs_remove(osb->osb_ctxt);
+        /* Orphan scan should be stopped as early as possible */
+        ocfs2_orphan_scan_stop(osb);
        ocfs2_disable_quotas(osb);
        ocfs2_shutdown_local_alloc(osb);
@@ -1833,6 +1865,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
        if (osb->cconn)
                ocfs2_dlm_shutdown(osb, hangup_needed);
+        ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
        debugfs_remove(osb->osb_debug_root);
        if (hangup_needed)
@@ -1880,7 +1913,8 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
 static int ocfs2_initialize_super(struct super_block *sb,
                                  struct buffer_head *bh,
-                                  int sector_size)
+                                  int sector_size,
+                                  struct ocfs2_blockcheck_stats *stats)
 {
        int status;
        int i, cbits, bbits;
@@ -1939,6 +1973,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
        atomic_set(&osb->alloc_stats.bg_allocs, 0);
        atomic_set(&osb->alloc_stats.bg_extends, 0);
+        /* Copy the blockcheck stats from the superblock probe */
+        osb->osb_ecc_stats = *stats;
        ocfs2_init_node_maps(osb);
        snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
@@ -2169,7 +2206,8 @@ bail:
 */
 static int ocfs2_verify_volume(struct ocfs2_dinode *di,
                               struct buffer_head *bh,
-                               u32 blksz)
+                               u32 blksz,
+                               struct ocfs2_blockcheck_stats *stats)
 {
        int status = -EAGAIN;
@@ -2182,7 +2220,8 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
                    OCFS2_FEATURE_INCOMPAT_META_ECC) {
                        status = ocfs2_block_check_validate(bh->b_data,
                                                            bh->b_size,
-                                                            &di->i_check);
+                                                            &di->i_check,
+                                                            stats);
                        if (status)
                                goto out;
                }
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index ab713ebdd546..40e53702948c 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -50,6 +50,10 @@ static inline int is_in_system_inode_array(struct ocfs2_super *osb,
                                           int type,
                                           u32 slot);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
+#endif
 static inline int is_global_system_inode(int type)
 {
        return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE &&
@@ -118,6 +122,21 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
                inode = NULL;
                goto bail;
        }
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+        if (type == LOCAL_USER_QUOTA_SYSTEM_INODE ||
+            type == LOCAL_GROUP_QUOTA_SYSTEM_INODE ||
+            type == JOURNAL_SYSTEM_INODE) {
+                /* Ignore inode lock on these inodes as the lock does not
+                 * really belong to any process and lockdep cannot handle
+                 * that */
+                OCFS2_I(inode)->ip_inode_lockres.l_lockdep_map.key = NULL;
+        } else {
+                lockdep_init_map(&OCFS2_I(inode)->ip_inode_lockres.
+                                                                l_lockdep_map,
+                                 ocfs2_system_inodes[type].si_name,
+                                 &ocfs2_sysfile_cluster_lock_key[type], 0);
+        }
+#endif
 bail:
        return inode;
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 15631019dc63..ba320e250747 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3154,7 +3154,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
                     le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
                if (func) {
                        ret = func(inode, bucket, para);
-                        if (ret)
+                        if (ret && ret != -ERANGE)
                                mlog_errno(ret);
                        /* Fall through to bucket_relse() */
                }
@@ -3261,7 +3261,8 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
                                                  ocfs2_list_xattr_bucket,
                                                  &xl);
                if (ret) {
-                        mlog_errno(ret);
+                        if (ret != -ERANGE)
+                                mlog_errno(ret);
                        goto out;
                }
diff --git a/fs/open.c b/fs/open.c
index 7200e23d9258..dd98e8076024 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -378,63 +378,63 @@ SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64);
 #endif
 #endif /* BITS_PER_LONG == 32 */
-SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
+/*
+ * do_fallocate - run fallocate on an already-opened file.
+ * @file:   open file to operate on; this function neither takes nor drops
+ *          a reference on it
+ * @mode:   0 or a value containing FALLOC_FL_KEEP_SIZE
+ * @offset: start of the range, must be >= 0
+ * @len:    length of the range, must be > 0
+ *
+ * Returns the result of the inode's ->fallocate method, or:
+ *   -EINVAL      negative @offset or non-positive @len
+ *   -EOPNOTSUPP  non-zero @mode without FALLOC_FL_KEEP_SIZE, or the inode
+ *                has no ->fallocate method
+ *   -EBADF       @file was not opened for writing
+ *   -ESPIPE      the inode is a FIFO
+ *   -ENODEV      the inode is neither a regular file nor a directory
+ *   -EFBIG       @offset + @len exceeds s_maxbytes or wraps negative
+ * and whatever security_file_permission() returns when it is non-zero.
+ */
+int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 {
-        struct file *file;
+        struct inode *inode = file->f_path.dentry->d_inode;
-        struct inode *inode;
+        long ret;
-        long ret = -EINVAL;
        if (offset < 0 || len <= 0)
-                goto out;
+                return -EINVAL;
        /* Return error if mode is not supported */
-        ret = -EOPNOTSUPP;
        if (mode && !(mode & FALLOC_FL_KEEP_SIZE))
-                goto out;
+                return -EOPNOTSUPP;
-        ret = -EBADF;
-        file = fget(fd);
-        if (!file)
-                goto out;
        if (!(file->f_mode & FMODE_WRITE))
-                goto out_fput;
+                return -EBADF;
        /*
         * Revalidate the write permissions, in case security policy has
         * changed since the files were opened.
         */
        ret = security_file_permission(file, MAY_WRITE);
        if (ret)
-                goto out_fput;
+                return ret;
-        inode = file->f_path.dentry->d_inode;
-        ret = -ESPIPE;
        if (S_ISFIFO(inode->i_mode))
-                goto out_fput;
+                return -ESPIPE;
-        ret = -ENODEV;
        /*
         * Let individual file system decide if it supports preallocation
         * for directories or not.
         */
        if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
-                goto out_fput;
+                return -ENODEV;
-        ret = -EFBIG;
        /* Check for wrap through zero too */
        if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
-                goto out_fput;
+                return -EFBIG;
-        if (inode->i_op->fallocate)
+        if (!inode->i_op->fallocate)
-                ret = inode->i_op->fallocate(inode, mode, offset, len);
+                return -EOPNOTSUPP;
-        else
-                ret = -EOPNOTSUPP;
-out_fput:
+        return inode->i_op->fallocate(inode, mode, offset, len);
-        fput(file);
-out:
-        return ret;
 }
+/*
+ * fallocate(2) entry point: look up @fd with fget(), pass the file to
+ * do_fallocate() and drop the reference again with fput().
+ * Returns -EBADF when fget() yields no file, otherwise do_fallocate()'s result.
+ */
+SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
+{
+        struct file *filp = fget(fd);
+        int ret;
+        if (!filp)
+                return -EBADF;
+        ret = do_fallocate(filp, mode, offset, len);
+        fput(filp);
+        return ret;
+}
 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
 asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len)
 {
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 1a9c7878f864..ea4e6cb29e13 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -436,7 +436,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
        rcu_assign_pointer(ptbl->part[partno], p);
        /* suppress uevent if the disk supresses it */
-        if (!dev_get_uevent_suppress(pdev))
+        if (!dev_get_uevent_suppress(ddev))
                kobject_uevent(&pdev->kobj, KOBJ_ADD);
        return p;
diff --git a/fs/pipe.c b/fs/pipe.c
index f7dd21ad85a6..52c415114838 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -68,8 +68,8 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
                pipe_lock_nested(pipe1, I_MUTEX_PARENT);
                pipe_lock_nested(pipe2, I_MUTEX_CHILD);
        } else {
-                pipe_lock_nested(pipe2, I_MUTEX_CHILD);
+                pipe_lock_nested(pipe2, I_MUTEX_PARENT);
-                pipe_lock_nested(pipe1, I_MUTEX_PARENT);
+                pipe_lock_nested(pipe1, I_MUTEX_CHILD);
        }
 }
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 63d965193b22..11a7b5c68153 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -18,6 +18,7 @@ proc-y	+= meminfo.o
 proc-y  += stat.o
 proc-y  += uptime.o
 proc-y  += version.o
+proc-y  += softirqs.o
 proc-$(CONFIG_PROC_SYSCTL)      += proc_sysctl.o
 proc-$(CONFIG_NET)              += proc_net.o
 proc-$(CONFIG_PROC_KCORE)       += kcore.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1539e630c47d..175db258942f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -234,23 +234,20 @@ static int check_mem_permission(struct task_struct *task)
+/*
+ * mm_for_maps - get a referenced mm of @task if the caller may read it.
+ *
+ * Holds task->cred_guard_mutex (killable) around the lookup and the access
+ * check.  Returns NULL when the mutex wait is interrupted, when
+ * get_task_mm() yields no mm, or when the mm is not current's own and
+ * ptrace_may_access(task, PTRACE_MODE_READ) refuses.  On success the mm
+ * reference obtained by get_task_mm() is returned; mmap_sem is NOT taken
+ * here any more.
+ */
 struct mm_struct *mm_for_maps(struct task_struct *task)
 {
-        struct mm_struct *mm = get_task_mm(task);
+        struct mm_struct *mm;
-        if (!mm)
+        if (mutex_lock_killable(&task->cred_guard_mutex))
                return NULL;
-        down_read(&mm->mmap_sem);
-        task_lock(task);
+        mm = get_task_mm(task);
-        if (task->mm != mm)
+        if (mm && mm != current->mm &&
-                goto out;
+                        !ptrace_may_access(task, PTRACE_MODE_READ)) {
-        if (task->mm != current->mm &&
+                mmput(mm);
-            __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
+                mm = NULL;
-                goto out;
+        }
-        task_unlock(task);
+        mutex_unlock(&task->cred_guard_mutex);
        return mm;
-out:
-        task_unlock(task);
-        up_read(&mm->mmap_sem);
-        mmput(mm);
-        return NULL;
 }
 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
@@ -1006,7 +1003,12 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
        if (!task)
                return -ESRCH;
-        oom_adjust = task->oomkilladj;
+        task_lock(task);
+        if (task->mm)
+                oom_adjust = task->mm->oom_adj;
+        else
+                oom_adjust = OOM_DISABLE;
+        task_unlock(task);
        put_task_struct(task);
        len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
@@ -1035,11 +1037,19 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
        task = get_proc_task(file->f_path.dentry->d_inode);
        if (!task)
                return -ESRCH;
-        if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) {
+        task_lock(task);
+        if (!task->mm) {
+                task_unlock(task);
+                put_task_struct(task);
+                return -EINVAL;
+        }
+        if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) {
+                task_unlock(task);
                put_task_struct(task);
                return -EACCES;
        }
-        task->oomkilladj = oom_adjust;
+        task->mm->oom_adj = oom_adjust;
+        task_unlock(task);
        put_task_struct(task);
        if (end - buffer == 0)
                return -EIO;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index c6b0302af4c4..d5c410d47fae 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -64,10 +64,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
                "Inactive(anon): %8lu kB\n"
                "Active(file):   %8lu kB\n"
                "Inactive(file): %8lu kB\n"
-#ifdef CONFIG_UNEVICTABLE_LRU
                "Unevictable:    %8lu kB\n"
                "Mlocked:        %8lu kB\n"
-#endif
 #ifdef CONFIG_HIGHMEM
                "HighTotal:      %8lu kB\n"
                "HighFree:       %8lu kB\n"
@@ -109,10 +107,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
                K(pages[LRU_INACTIVE_ANON]),
                K(pages[LRU_ACTIVE_FILE]),
                K(pages[LRU_INACTIVE_FILE]),
-#ifdef CONFIG_UNEVICTABLE_LRU
                K(pages[LRU_UNEVICTABLE]),
                K(global_page_state(NR_MLOCK)),
-#endif
 #ifdef CONFIG_HIGHMEM
                K(i.totalhigh),
                K(i.freehigh),
diff --git a/fs/proc/page.c b/fs/proc/page.c
index e9983837d08d..2707c6c7a20f 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -6,11 +6,13 @@
 #include <linux/mmzone.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/hugetlb.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 #define KPMSIZE sizeof(u64)
 #define KPMMASK (KPMSIZE - 1)
 /* /proc/kpagecount - an array exposing page counts
 *
 * Each entry is a u64 representing the corresponding
@@ -32,20 +34,22 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
                return -EINVAL;
        while (count > 0) {
-                ppage = NULL;
                if (pfn_valid(pfn))
                        ppage = pfn_to_page(pfn);
-                pfn++;
+                else
+                        ppage = NULL;
                if (!ppage)
                        pcount = 0;
                else
                        pcount = page_mapcount(ppage);
-                if (put_user(pcount, out++)) {
+                if (put_user(pcount, out)) {
                        ret = -EFAULT;
                        break;
                }
+                pfn++;
+                out++;
                count -= KPMSIZE;
        }
@@ -68,19 +72,122 @@ static const struct file_operations proc_kpagecount_operations = {
 /* These macros are used to decouple internal flags from exported ones */
-#define KPF_LOCKED     0
+#define KPF_LOCKED              0
-#define KPF_ERROR      1
+#define KPF_ERROR               1
-#define KPF_REFERENCED 2
+#define KPF_REFERENCED          2
-#define KPF_UPTODATE   3
+#define KPF_UPTODATE            3
-#define KPF_DIRTY      4
+#define KPF_DIRTY               4
-#define KPF_LRU        5
+#define KPF_LRU                 5
-#define KPF_ACTIVE     6
+#define KPF_ACTIVE              6
-#define KPF_SLAB       7
+#define KPF_SLAB                7
-#define KPF_WRITEBACK  8
+#define KPF_WRITEBACK           8
-#define KPF_RECLAIM    9
+#define KPF_RECLAIM             9
-#define KPF_BUDDY     10
+#define KPF_BUDDY               10
+/* 11-20: new additions in 2.6.31 */
+#define KPF_MMAP                11
+#define KPF_ANON                12
+#define KPF_SWAPCACHE           13
+#define KPF_SWAPBACKED          14
+#define KPF_COMPOUND_HEAD       15
+#define KPF_COMPOUND_TAIL       16
+#define KPF_HUGE                17
+#define KPF_UNEVICTABLE         18
+#define KPF_NOPAGE              20
+/* kernel hacking aids
+ * WARNING: subject to change, never rely on them!
+ */
+#define KPF_RESERVED            32
+#define KPF_MLOCKED             33
+#define KPF_MAPPEDTODISK        34
+#define KPF_PRIVATE             35
+#define KPF_PRIVATE_2           36
+#define KPF_OWNER_PRIVATE       37
+#define KPF_ARCH                38
+#define KPF_UNCACHED            39
+/* Extract bit @kbit of @kflags and return it placed at bit position @ubit. */
+static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
+{
+        u64 bit = (kflags >> kbit) & 1;
+        return bit << ubit;
+}
-#define kpf_copy_bit(flags, dstpos, srcpos) (((flags >> srcpos) & 1) << dstpos)
+/*
+ * get_uflags - build the exported KPF_* flag word for one page.
+ * @page: page to describe, or NULL for a pfn without a struct page
+ *
+ * Returns a u64 in the exported KPF_* bit layout: pseudo flags derived
+ * from page state tests plus bits copied one by one out of page->flags.
+ * A NULL @page yields only KPF_NOPAGE.
+ */
+static u64 get_uflags(struct page *page)
+{
+        u64 k;
+        u64 u;
+        /*
+         * pseudo flag: KPF_NOPAGE
+         * it differentiates a memory hole from a page with no flags
+         */
+        if (!page)
+                return (u64)1 << KPF_NOPAGE;
+        k = page->flags;
+        u = 0;
+        /*
+         * pseudo flags for the well known (anonymous) memory mapped pages
+         *
+         * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
+         * simple test in page_mapped() is not enough.
+         */
+        if (!PageSlab(page) && page_mapped(page))
+                u |= (u64)1 << KPF_MMAP;
+        if (PageAnon(page))
+                u |= (u64)1 << KPF_ANON;
+        /*
+         * compound pages: export both head/tail info
+         * they together define a compound page's start/end pos and order
+         */
+        if (PageHead(page))
+                u |= (u64)1 << KPF_COMPOUND_HEAD;
+        if (PageTail(page))
+                u |= (u64)1 << KPF_COMPOUND_TAIL;
+        if (PageHuge(page))
+                u |= (u64)1 << KPF_HUGE;
+        u |= kpf_copy_bit(k, KPF_LOCKED,        PG_locked);
+        /*
+         * Caveats on high order pages:
+         * PG_buddy will only be set on the head page; SLUB/SLQB do the same
+         * for PG_slab; SLOB won't set PG_slab at all on compound pages.
+         */
+        u |= kpf_copy_bit(k, KPF_SLAB,          PG_slab);
+        u |= kpf_copy_bit(k, KPF_BUDDY,         PG_buddy);
+        u |= kpf_copy_bit(k, KPF_ERROR,         PG_error);
+        u |= kpf_copy_bit(k, KPF_DIRTY,         PG_dirty);
+        u |= kpf_copy_bit(k, KPF_UPTODATE,      PG_uptodate);
+        u |= kpf_copy_bit(k, KPF_WRITEBACK,     PG_writeback);
+        u |= kpf_copy_bit(k, KPF_LRU,           PG_lru);
+        u |= kpf_copy_bit(k, KPF_REFERENCED,    PG_referenced);
+        u |= kpf_copy_bit(k, KPF_ACTIVE,        PG_active);
+        u |= kpf_copy_bit(k, KPF_RECLAIM,       PG_reclaim);
+        u |= kpf_copy_bit(k, KPF_SWAPCACHE,     PG_swapcache);
+        u |= kpf_copy_bit(k, KPF_SWAPBACKED,    PG_swapbacked);
+        u |= kpf_copy_bit(k, KPF_UNEVICTABLE,   PG_unevictable);
+        u |= kpf_copy_bit(k, KPF_MLOCKED,       PG_mlocked);
+#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
+        u |= kpf_copy_bit(k, KPF_UNCACHED,      PG_uncached);
+#endif
+        u |= kpf_copy_bit(k, KPF_RESERVED,      PG_reserved);
+        u |= kpf_copy_bit(k, KPF_MAPPEDTODISK,  PG_mappedtodisk);
+        u |= kpf_copy_bit(k, KPF_PRIVATE,       PG_private);
+        u |= kpf_copy_bit(k, KPF_PRIVATE_2,     PG_private_2);
+        u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
+        u |= kpf_copy_bit(k, KPF_ARCH,          PG_arch_1);
+        return u;
+}
 static ssize_t kpageflags_read(struct file *file, char __user *buf,
                             size_t count, loff_t *ppos)
@@ -90,7 +197,6 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
        unsigned long src = *ppos;
        unsigned long pfn;
        ssize_t ret = 0;
-        u64 kflags, uflags;
        pfn = src / KPMSIZE;
        count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
@@ -98,32 +204,18 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
                return -EINVAL;
        while (count > 0) {
-                ppage = NULL;
                if (pfn_valid(pfn))
                        ppage = pfn_to_page(pfn);
-                pfn++;
-                if (!ppage)
-                        kflags = 0;
                else
-                        kflags = ppage->flags;
+                        ppage = NULL;
-                uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) |
+                if (put_user(get_uflags(ppage), out)) {
-                        kpf_copy_bit(kflags, KPF_ERROR, PG_error) |
-                        kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) |
-                        kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) |
-                        kpf_copy_bit(kflags, KPF_DIRTY, PG_dirty) |
-                        kpf_copy_bit(kflags, KPF_LRU, PG_lru) |
-                        kpf_copy_bit(kflags, KPF_ACTIVE, PG_active) |
-                        kpf_copy_bit(kflags, KPF_SLAB, PG_slab) |
-                        kpf_copy_bit(kflags, KPF_WRITEBACK, PG_writeback) |
-                        kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) |
-                        kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy);
-                if (put_user(uflags, out++)) {
                        ret = -EFAULT;
                        break;
                }
+                pfn++;
+                out++;
                count -= KPMSIZE;
        }
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index fc6c3025befd..7ba79a54948c 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -195,20 +195,20 @@ void proc_device_tree_add_node(struct device_node *np,
                        p = fixup_name(np, de, p);
                ent = proc_mkdir(p, de);
-                if (ent == 0)
+                if (ent == NULL)
                        break;
                proc_device_tree_add_node(child, ent);
        }
        of_node_put(child);
-        for (pp = np->properties; pp != 0; pp = pp->next) {
+        for (pp = np->properties; pp != NULL; pp = pp->next) {
                p = pp->name;
                if (duplicate_name(de, p))
                        p = fixup_name(np, de, p);
                ent = __proc_device_tree_add_prop(de, pp, p);
-                if (ent == 0)
+                if (ent == NULL)
                        break;
        }
 }
@@ -221,10 +221,10 @@ void __init proc_device_tree_init(void)
        struct device_node *root;
        proc_device_tree = proc_mkdir("device-tree", NULL);
-        if (proc_device_tree == 0)
+        if (proc_device_tree == NULL)
                return;
        root = of_find_node_by_path("/");
-        if (root == 0) {
+        if (root == NULL) {
                printk(KERN_ERR "/proc/device-tree: can't find root\n");
                return;
        }
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
new file mode 100644
index 000000000000..1807c2419f17
--- /dev/null
+++ b/fs/proc/softirqs.c
@@ -0,0 +1,44 @@
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+/*
+ * /proc/softirqs: a header row of CPU labels followed by one row per
+ * softirq holding its count for every possible CPU.
+ */
+static int show_softirqs(struct seq_file *p, void *v)
+{
+        int cpu;
+        int nr;
+        /* header: blank label column, then one label per possible CPU */
+        seq_puts(p, "                ");
+        for_each_possible_cpu(cpu)
+                seq_printf(p, "CPU%-8d", cpu);
+        seq_puts(p, "\n");
+        nr = 0;
+        while (nr < NR_SOFTIRQS) {
+                seq_printf(p, "%8s:", softirq_to_name[nr]);
+                for_each_possible_cpu(cpu)
+                        seq_printf(p, " %10u", kstat_softirqs_cpu(nr, cpu));
+                seq_puts(p, "\n");
+                nr++;
+        }
+        return 0;
+}
+/* open handler for /proc/softirqs: attach show_softirqs() through single_open() */
+static int softirqs_open(struct inode *inode, struct file *file)
+{
+        int rc = single_open(file, show_softirqs, NULL);
+        return rc;
+}
+/* file operations for /proc/softirqs; everything but open is a seq_file helper */
+static const struct file_operations proc_softirqs_operations = {
+        .release        = single_release,
+        .llseek         = seq_lseek,
+        .read           = seq_read,
+        .open           = softirqs_open,
+};
+/*
+ * Create the "softirqs" proc entry backed by proc_softirqs_operations.
+ * The proc_create() result is not checked; this initcall always returns 0.
+ * NOTE(review): mode 0 / NULL parent presumably mean default permissions
+ * directly under /proc -- confirm against proc_create().
+ */
+static int __init proc_softirqs_init(void)
+{
+        proc_create("softirqs", 0, NULL, &proc_softirqs_operations);
+        return 0;
+}
+module_init(proc_softirqs_init);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 81e4eb60972e..7cc726c6d70a 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -29,6 +29,8 @@ static int show_stat(struct seq_file *p, void *v)
        cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
        cputime64_t guest;
        u64 sum = 0;
+        u64 sum_softirq = 0;
+        unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
        struct timespec boottime;
        unsigned int per_irq_sum;
@@ -53,6 +55,13 @@ static int show_stat(struct seq_file *p, void *v)
                        sum += kstat_irqs_cpu(j, i);
                }
                sum += arch_irq_stat_cpu(i);
+                for (j = 0; j < NR_SOFTIRQS; j++) {
+                        unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
+                        per_softirq_sums[j] += softirq_stat;
+                        sum_softirq += softirq_stat;
+                }
        }
        sum += arch_irq_stat();
@@ -115,6 +124,12 @@ static int show_stat(struct seq_file *p, void *v)
                nr_running(),
                nr_iowait());
+        seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);
+        for (i = 0; i < NR_SOFTIRQS; i++)
+                seq_printf(p, " %u", per_softirq_sums[i]);
+        seq_printf(p, "\n");
        return 0;
 }
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6f61b7cc32e0..9bd8be1d235c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -119,6 +119,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
        mm = mm_for_maps(priv->task);
        if (!mm)
                return NULL;
+        down_read(&mm->mmap_sem);
        tail_vma = get_gate_vma(priv->task);
        priv->tail_vma = tail_vma;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 64a72e2e7650..8f5c05d3dbd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -189,6 +189,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
                priv->task = NULL;
                return NULL;
        }
+        down_read(&mm->mmap_sem);
        /* start from the Nth VMA */
        for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 5edcc3f92ba7..0872afa58d39 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -166,12 +166,7 @@ static const struct file_operations proc_vmcore_operations = {
+/* Allocate one zero-filled struct vmcore with GFP_KERNEL; the kzalloc() result is returned as is. */
 static struct vmcore* __init get_new_element(void)
 {
-        struct vmcore *p;
+        return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
-        p = kmalloc(sizeof(*p), GFP_KERNEL);
-        if (p)
-                memset(p, 0, sizeof(*p));
-        return p;
 }
 static u64 __init get_vmcore_size_elf64(char *elfptr)
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 607c579e5eca..38f7bd559f35 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2042,7 +2042,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
                 * changes */
                invalidate_bdev(sb->s_bdev);
        }
-        mutex_lock(&inode->i_mutex);
        mutex_lock(&dqopt->dqonoff_mutex);
        if (sb_has_quota_loaded(sb, type)) {
                error = -EBUSY;
@@ -2054,9 +2053,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
                 * possible) Also nobody should write to the file - we use
                 * special IO operations which ignore the immutable bit. */
                down_write(&dqopt->dqptr_sem);
+                mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
                oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
                                             S_NOQUOTA);
                inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
+                mutex_unlock(&inode->i_mutex);
                up_write(&dqopt->dqptr_sem);
                sb->dq_op->drop(inode);
        }
@@ -2080,7 +2081,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
                goto out_file_init;
        }
        mutex_unlock(&dqopt->dqio_mutex);
-        mutex_unlock(&inode->i_mutex);
        spin_lock(&dq_state_lock);
        dqopt->flags |= dquot_state_flag(flags, type);
        spin_unlock(&dq_state_lock);
@@ -2094,16 +2094,17 @@ out_file_init:
        dqopt->files[type] = NULL;
        iput(inode);
 out_lock:
-        mutex_unlock(&dqopt->dqonoff_mutex);
        if (oldflags != -1) {
                down_write(&dqopt->dqptr_sem);
+                mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
                /* Set the flags back (in the case of accidental quotaon()
                 * on a wrong file we don't want to mess up the flags) */
                inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
                inode->i_flags |= oldflags;
+                mutex_unlock(&inode->i_mutex);
                up_write(&dqopt->dqptr_sem);
        }
-        mutex_unlock(&inode->i_mutex);
+        mutex_unlock(&dqopt->dqonoff_mutex);
 out_fmt:
        put_quota_format(fmt);
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index ebb2c417912c..11f0c06316de 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -20,6 +20,7 @@
 #include <linux/ramfs.h>
 #include <linux/pagevec.h>
 #include <linux/mman.h>
+#include <linux/sched.h>
 #include <asm/uaccess.h>
 #include "internal.h"
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 3a6b193d8444..0ff7566c767c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -202,9 +202,12 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
                                return -EINVAL;
                        opts->mode = option & S_IALLUGO;
                        break;
-                default:
+                /*
-                        printk(KERN_ERR "ramfs: bad mount option: %s\n", p);
+                 * We might like to report bad mount options here;
-                        return -EINVAL;
+                 * but traditionally ramfs has ignored all mount options,
+                 * and as it is used as a !CONFIG_SHMEM simple substitute
+                 * for tmpfs, better continue to ignore other mount options.
+                 */
                }
        }
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 4beb964a2a3e..128d3f7c8aa5 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -1270,9 +1270,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
                                        RFALSE(ih, "PAP-12210: ih must be 0");
-                                        if (is_direntry_le_ih
+                                        aux_ih = B_N_PITEM_HEAD(tbS0, item_pos);
-                                            (aux_ih =
+                                        if (is_direntry_le_ih(aux_ih)) {
-                                             B_N_PITEM_HEAD(tbS0, item_pos))) {
                                                /* we append to directory item */
                                                int entry_count;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 6fd0f47e45db..a14d6cd9eeda 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1131,8 +1131,6 @@ static void init_inode(struct inode *inode, struct treepath *path)
        REISERFS_I(inode)->i_trans_id = 0;
        REISERFS_I(inode)->i_jl = NULL;
        mutex_init(&(REISERFS_I(inode)->i_mmap));
-        reiserfs_init_acl_access(inode);
-        reiserfs_init_acl_default(inode);
        reiserfs_init_xattr_rwsem(inode);
        if (stat_data_v1(ih)) {
@@ -1834,8 +1832,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
            REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
        sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
        mutex_init(&(REISERFS_I(inode)->i_mmap));
-        reiserfs_init_acl_access(inode);
-        reiserfs_init_acl_default(inode);
        reiserfs_init_xattr_rwsem(inode);
        /* key to search for correct place for new stat data */
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 77f5bb746bf0..90622200b39c 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -997,7 +997,7 @@ static int reiserfs_async_progress_wait(struct super_block *s)
        DEFINE_WAIT(wait);
        struct reiserfs_journal *j = SB_JOURNAL(s);
        if (atomic_read(&j->j_async_throttle))
-                congestion_wait(WRITE, HZ / 10);
+                congestion_wait(BLK_RW_ASYNC, HZ / 10);
        return 0;
 }
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 381750a155f6..03d85cbf90bf 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -390,7 +390,8 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
        if (last_first == FIRST_TO_LAST) {
                /* if ( if item in position item_num in buffer SOURCE is directory item ) */
-                if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num)))
+                ih = B_N_PITEM_HEAD(src, item_num);
+                if (is_direntry_le_ih(ih))
                        leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST,
                                              item_num, 0, cpy_bytes);
                else {
@@ -418,7 +419,8 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
                }
        } else {
                /*  if ( if item in position item_num in buffer SOURCE is directory item ) */
-                if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num)))
+                ih = B_N_PITEM_HEAD(src, item_num);
+                if (is_direntry_le_ih(ih))
                        leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
                                              item_num,
                                              I_ENTRY_COUNT(ih) - cpy_bytes,
@@ -774,8 +776,8 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
                        leaf_delete_items_entirely(cur_bi, first + 1,
                                                   del_num - 1);
-                        if (is_direntry_le_ih
+                        ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1);
-                            (ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1)))
+                        if (is_direntry_le_ih(ih))
                                /* the last item is directory  */
                                /* len = numbers of directory entries in this item */
                                len = ih_entry_count(ih);
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 238e9d9b31e0..18b315d3d104 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -82,7 +82,6 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
                if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) {
                        printk
                            ("reiserfs_resize: unable to allocate memory for journal bitmaps\n");
-                        unlock_super(s);
                        return -ENOMEM;
                }
                /* the new journal bitmaps are zero filled, now we copy in the bitmap
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 2969773cfc22..7adea74d6a8a 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -24,7 +24,6 @@
 #include <linux/exportfs.h>
 #include <linux/quotaops.h>
 #include <linux/vfs.h>
-#include <linux/mnt_namespace.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/crc32.h>
@@ -529,10 +528,6 @@ static void init_once(void *foo)
        INIT_LIST_HEAD(&ei->i_prealloc_list);
        inode_init_once(&ei->vfs_inode);
-#ifdef CONFIG_REISERFS_FS_POSIX_ACL
-        ei->i_acl_access = NULL;
-        ei->i_acl_default = NULL;
-#endif
 }
 static int init_inodecache(void)
@@ -580,25 +575,6 @@ static void reiserfs_dirty_inode(struct inode *inode)
        reiserfs_write_unlock(inode->i_sb);
 }
-#ifdef CONFIG_REISERFS_FS_POSIX_ACL
-static void reiserfs_clear_inode(struct inode *inode)
-{
-        struct posix_acl *acl;
-        acl = REISERFS_I(inode)->i_acl_access;
-        if (acl && !IS_ERR(acl))
-                posix_acl_release(acl);
-        REISERFS_I(inode)->i_acl_access = NULL;
-        acl = REISERFS_I(inode)->i_acl_default;
-        if (acl && !IS_ERR(acl))
-                posix_acl_release(acl);
-        REISERFS_I(inode)->i_acl_default = NULL;
-}
-#else
-#define reiserfs_clear_inode NULL
-#endif
 #ifdef CONFIG_QUOTA
 static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
                                    size_t, loff_t);
@@ -612,7 +588,6 @@ static const struct super_operations reiserfs_sops = {
        .write_inode = reiserfs_write_inode,
        .dirty_inode = reiserfs_dirty_inode,
        .delete_inode = reiserfs_delete_inode,
-        .clear_inode = reiserfs_clear_inode,
        .put_super = reiserfs_put_super,
        .write_super = reiserfs_write_super,
        .sync_fs = reiserfs_sync_fs,
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index f3d47d856848..6925b835a43b 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -46,7 +46,6 @@
 #include <linux/reiserfs_acl.h>
 #include <asm/uaccess.h>
 #include <net/checksum.h>
-#include <linux/smp_lock.h>
 #include <linux/stat.h>
 #include <linux/quotaops.h>
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index c303c426fe2b..35d6e672a279 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -188,29 +188,6 @@ static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
        return ERR_PTR(-EINVAL);
 }
-static inline void iset_acl(struct inode *inode, struct posix_acl **i_acl,
-                            struct posix_acl *acl)
-{
-        spin_lock(&inode->i_lock);
-        if (*i_acl != ERR_PTR(-ENODATA))
-                posix_acl_release(*i_acl);
-        *i_acl = posix_acl_dup(acl);
-        spin_unlock(&inode->i_lock);
-}
-static inline struct posix_acl *iget_acl(struct inode *inode,
-                                         struct posix_acl **i_acl)
-{
-        struct posix_acl *acl = ERR_PTR(-ENODATA);
-        spin_lock(&inode->i_lock);
-        if (*i_acl != ERR_PTR(-ENODATA))
-                acl = posix_acl_dup(*i_acl);
-        spin_unlock(&inode->i_lock);
-        return acl;
-}
 /*
 * Inode operation get_posix_acl().
 *
@@ -220,34 +197,29 @@ static inline struct posix_acl *iget_acl(struct inode *inode,
 struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
 {
        char *name, *value;
-        struct posix_acl *acl, **p_acl;
+        struct posix_acl *acl;
        int size;
        int retval;
-        struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
+        acl = get_cached_acl(inode, type);
+        if (acl != ACL_NOT_CACHED)
+                return acl;
        switch (type) {
        case ACL_TYPE_ACCESS:
                name = POSIX_ACL_XATTR_ACCESS;
-                p_acl = &reiserfs_i->i_acl_access;
                break;
        case ACL_TYPE_DEFAULT:
                name = POSIX_ACL_XATTR_DEFAULT;
-                p_acl = &reiserfs_i->i_acl_default;
                break;
        default:
-                return ERR_PTR(-EINVAL);
+                BUG();
        }
-        acl = iget_acl(inode, p_acl);
-        if (acl && !IS_ERR(acl))
-                return acl;
-        else if (PTR_ERR(acl) == -ENODATA)
-                return NULL;
        size = reiserfs_xattr_get(inode, name, NULL, 0);
        if (size < 0) {
                if (size == -ENODATA || size == -ENOSYS) {
-                        *p_acl = ERR_PTR(-ENODATA);
+                        set_cached_acl(inode, type, NULL);
                        return NULL;
                }
                return ERR_PTR(size);
@@ -262,14 +234,13 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
                /* This shouldn't actually happen as it should have
                   been caught above.. but just in case */
                acl = NULL;
-                *p_acl = ERR_PTR(-ENODATA);
        } else if (retval < 0) {
                acl = ERR_PTR(retval);
        } else {
                acl = posix_acl_from_disk(value, retval);
-                if (!IS_ERR(acl))
-                        iset_acl(inode, p_acl, acl);
        }
+        if (!IS_ERR(acl))
+                set_cached_acl(inode, type, acl);
        kfree(value);
        return acl;
@@ -287,10 +258,8 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
 {
        char *name;
        void *value = NULL;
-        struct posix_acl **p_acl;
        size_t size = 0;
        int error;
-        struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
        if (S_ISLNK(inode->i_mode))
                return -EOPNOTSUPP;
@@ -298,7 +267,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
        switch (type) {
        case ACL_TYPE_ACCESS:
                name = POSIX_ACL_XATTR_ACCESS;
-                p_acl = &reiserfs_i->i_acl_access;
                if (acl) {
                        mode_t mode = inode->i_mode;
                        error = posix_acl_equiv_mode(acl, &mode);
@@ -313,7 +281,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
                break;
        case ACL_TYPE_DEFAULT:
                name = POSIX_ACL_XATTR_DEFAULT;
-                p_acl = &reiserfs_i->i_acl_default;
                if (!S_ISDIR(inode->i_mode))
                        return acl ? -EACCES : 0;
                break;
@@ -346,7 +313,7 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
        kfree(value);
        if (!error)
-                iset_acl(inode, p_acl, acl);
+                set_cached_acl(inode, type, acl);
        return error;
 }
@@ -379,11 +346,8 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
        }
        acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT);
-        if (IS_ERR(acl)) {
+        if (IS_ERR(acl))
-                if (PTR_ERR(acl) == -ENODATA)
-                        goto apply_umask;
                return PTR_ERR(acl);
-        }
        if (acl) {
                struct posix_acl *acl_copy;
diff --git a/fs/select.c b/fs/select.c
index 0fe0e1469df3..d870237e42c7 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -168,7 +168,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
        return table->entry++;
 }
-static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
 {
        struct poll_wqueues *pwq = wait->private;
        DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
@@ -194,6 +194,16 @@ static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
        return default_wake_function(&dummy_wait, mode, sync, key);
 }
+/*
+ * Wake function for poll table entries.  A wakeup whose key shares no bits
+ * with the events recorded in the entry is ignored (returns 0); a wakeup
+ * without a key, or with at least one matching bit, goes to __pollwake().
+ */
+static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+        struct poll_table_entry *pte =
+                container_of(wait, struct poll_table_entry, wait);
+        if (!key || ((unsigned long)key & pte->key))
+                return __pollwake(wait, mode, sync, key);
+        return 0;
+}
 /* Add a new entry */
 static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                                poll_table *p)
@@ -205,6 +215,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
        get_file(filp);
        entry->filp = filp;
        entry->wait_address = wait_address;
+        entry->key = p->key;
        init_waitqueue_func_entry(&entry->wait, pollwake);
        entry->wait.private = pwq;
        add_wait_queue(wait_address, &entry->wait);
@@ -362,6 +373,18 @@ get_max:
 #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
 #define POLLEX_SET (POLLPRI)
+/*
+ * Record in @wait->key the poll events select() is interested in for the
+ * descriptor selected by @bit: always POLLEX_SET, plus POLLIN_SET when @bit
+ * is set in @in and POLLOUT_SET when it is set in @out.  Does nothing when
+ * @wait is NULL.
+ */
+static inline void wait_key_set(poll_table *wait, unsigned long in,
+                                unsigned long out, unsigned long bit)
+{
+        unsigned long events = POLLEX_SET;
+        if (!wait)
+                return;
+        if (in & bit)
+                events |= POLLIN_SET;
+        if (out & bit)
+                events |= POLLOUT_SET;
+        wait->key = events;
+}
 int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 {
        ktime_t expire, *to = NULL;
@@ -418,20 +441,25 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
                                if (file) {
                                        f_op = file->f_op;
                                        mask = DEFAULT_POLLMASK;
-                                        if (f_op && f_op->poll)
+                                        if (f_op && f_op->poll) {
-                                                mask = (*f_op->poll)(file, retval ? NULL : wait);
+                                                wait_key_set(wait, in, out, bit);
+                                                mask = (*f_op->poll)(file, wait);
+                                        }
                                        fput_light(file, fput_needed);
                                        if ((mask & POLLIN_SET) && (in & bit)) {
                                                res_in |= bit;
                                                retval++;
+                                                wait = NULL;
                                        }
                                        if ((mask & POLLOUT_SET) && (out & bit)) {
                                                res_out |= bit;
                                                retval++;
+                                                wait = NULL;
                                        }
                                        if ((mask & POLLEX_SET) && (ex & bit)) {
                                                res_ex |= bit;
                                                retval++;
+                                                wait = NULL;
                                        }
                                }
                        }
@@ -685,8 +713,12 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
                mask = POLLNVAL;
                if (file != NULL) {
                        mask = DEFAULT_POLLMASK;
-                        if (file->f_op && file->f_op->poll)
+                        if (file->f_op && file->f_op->poll) {
+                                if (pwait)
+                                        pwait->key = pollfd->events |
+                                                        POLLERR | POLLHUP;
                                mask = file->f_op->poll(file, pwait);
+                        }
                        /* Mask out unneeded events. */
                        mask &= pollfd->events | POLLERR | POLLHUP;
                        fput_light(file, fput_needed);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 7f40f30c55c5..6c959275f2d0 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -640,6 +640,26 @@ int seq_puts(struct seq_file *m, const char *s)
 }
 EXPORT_SYMBOL(seq_puts);
+/**
+ * seq_write - write arbitrary data to buffer
+ * @seq: seq_file identifying the buffer to which data should be written
+ * @data: data address
+ * @len: number of bytes
+ *
+ * Copies @len bytes from @data to the end of the buffered output when they
+ * fit strictly below the buffer size; otherwise nothing is copied and the
+ * buffer is marked full by setting the count to the size.
+ *
+ * Return 0 on success, non-zero otherwise.
+ */
+int seq_write(struct seq_file *seq, const void *data, size_t len)
+{
+        /*
+         * Compare @len against the remaining room instead of testing
+         * "count + len < size": that sum is unsigned and can wrap for a
+         * huge @len, which would wrongly pass the check before memcpy().
+         */
+        if (seq->count < seq->size && len < seq->size - seq->count) {
+                memcpy(seq->buf + seq->count, data, len);
+                seq->count += len;
+                return 0;
+        }
+        seq->count = seq->size;
+        return -1;
+}
+EXPORT_SYMBOL(seq_write);
 struct list_head *seq_list_start(struct list_head *head, loff_t pos)
 {
        struct list_head *lh;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 3b52770f46ff..cb5fc57e370b 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -30,6 +30,7 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <linux/mutex.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
diff --git a/fs/super.c b/fs/super.c
index 83b47416d006..2761d3e22ed9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -545,24 +545,18 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
        if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
                if (force)
                        mark_files_ro(sb);
-                else if (!fs_may_remount_ro(sb)) {
+                else if (!fs_may_remount_ro(sb))
-                        unlock_kernel();
                        return -EBUSY;
-                }
                retval = vfs_dq_off(sb, 1);
-                if (retval < 0 && retval != -ENOSYS) {
+                if (retval < 0 && retval != -ENOSYS)
-                        unlock_kernel();
                        return -EBUSY;
-                }
        }
        remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
        if (sb->s_op->remount_fs) {
                retval = sb->s_op->remount_fs(sb, &flags, data);
-                if (retval) {
+                if (retval)
-                        unlock_kernel();
                        return retval;
-                }
        }
        sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
        if (remount_rw)
@@ -614,6 +608,7 @@ void emergency_remount(void)
 static DEFINE_IDA(unnamed_dev_ida);
 static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
+static int unnamed_dev_start = 0; /* don't bother trying below it */
 int set_anon_super(struct super_block *s, void *data)
 {
@@ -624,7 +619,9 @@ int set_anon_super(struct super_block *s, void *data)
        if (ida_pre_get(&unnamed_dev_ida, GFP_ATOMIC) == 0)
                return -ENOMEM;
        spin_lock(&unnamed_dev_lock);
-        error = ida_get_new(&unnamed_dev_ida, &dev);
+        error = ida_get_new_above(&unnamed_dev_ida, unnamed_dev_start, &dev);
+        if (!error)
+                unnamed_dev_start = dev + 1;
        spin_unlock(&unnamed_dev_lock);
        if (error == -EAGAIN)
                /* We raced and lost with another CPU. */
@@ -635,6 +632,8 @@ int set_anon_super(struct super_block *s, void *data)
        if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) {
                spin_lock(&unnamed_dev_lock);
                ida_remove(&unnamed_dev_ida, dev);
+                if (unnamed_dev_start > dev)
+                        unnamed_dev_start = dev;
                spin_unlock(&unnamed_dev_lock);
                return -EMFILE;
        }
@@ -651,6 +650,8 @@ void kill_anon_super(struct super_block *sb)
        generic_shutdown_super(sb);
        spin_lock(&unnamed_dev_lock);
        ida_remove(&unnamed_dev_ida, slot);
+        if (slot < unnamed_dev_start)
+                unnamed_dev_start = slot;
        spin_unlock(&unnamed_dev_lock);
 }
diff --git a/fs/sync.c b/fs/sync.c
index dd200025af85..3422ba61d86d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -112,8 +112,13 @@ restart:
        mutex_unlock(&mutex);
 }
+/*
+ * sync everything.  Start out by waking pdflush, because that writes back
+ * all queues in parallel.
+ */
 SYSCALL_DEFINE0(sync)
 {
+        wakeup_pdflush(0);
        sync_filesystems(0);
        sync_filesystems(1);
        if (unlikely(laptop_mode))
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 9345806c8853..2524714bece1 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -171,6 +171,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
        if (count > 0)
                *off = offs + count;
+        kfree(temp);
        return count;
 }
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index d88d0fac9fa5..14f2d71ea3ce 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -939,8 +939,10 @@ again:
        /* Remove from old parent's list and insert into new parent's list. */
        sysfs_unlink_sibling(sd);
        sysfs_get(new_parent_sd);
+        drop_nlink(old_parent->d_inode);
        sysfs_put(sd->s_parent);
        sd->s_parent = new_parent_sd;
+        inc_nlink(new_parent->d_inode);
        sysfs_link_sibling(sd);
 out_unlock:
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index a3ba217fbe74..1d897ad808e0 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -192,8 +192,11 @@ static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
        int error = -ENOMEM;
        unsigned long page = get_zeroed_page(GFP_KERNEL);
-        if (page)
+        if (page) {
                error = sysfs_getlink(dentry, (char *) page); 
+                if (error < 0)
+                        free_page((unsigned long)page);
+        }
        nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
        return NULL;
 }
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index c7798079e644..4e50286a4cc3 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -15,13 +15,13 @@
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
-#include <linux/smp_lock.h>
 #include <linux/swap.h>
 #include "sysv.h"
 static int sysv_readdir(struct file *, void *, filldir_t);
 const struct file_operations sysv_dir_operations = {
+        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .readdir        = sysv_readdir,
        .fsync          = simple_fsync,
@@ -74,8 +74,6 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
        unsigned long n = pos >> PAGE_CACHE_SHIFT;
        unsigned long npages = dir_pages(inode);
-        lock_kernel();
        pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
        if (pos >= inode->i_size)
                goto done;
@@ -113,7 +111,6 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
 done:
        filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset;
-        unlock_kernel();
        return 0;
 }
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 479923456a54..9824743832a7 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -21,7 +21,6 @@
 *  the superblock.
 */
-#include <linux/smp_lock.h>
 #include <linux/highuid.h>
 #include <linux/slab.h>
 #include <linux/init.h>
@@ -37,7 +36,6 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
        unsigned long time = get_seconds(), old_time;
        lock_super(sb);
-        lock_kernel();
        /*
         * If we are going to write out the super block,
@@ -52,7 +50,6 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
                mark_buffer_dirty(sbi->s_bh2);
        }
-        unlock_kernel();
        unlock_super(sb);
        return 0;
@@ -82,8 +79,6 @@ static void sysv_put_super(struct super_block *sb)
 {
        struct sysv_sb_info *sbi = SYSV_SB(sb);
-        lock_kernel();
        if (sb->s_dirt)
                sysv_write_super(sb);
@@ -99,8 +94,6 @@ static void sysv_put_super(struct super_block *sb)
                brelse(sbi->s_bh2);
        kfree(sbi);
-        unlock_kernel();
 }
 static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -275,7 +268,6 @@ int sysv_write_inode(struct inode *inode, int wait)
                return -EIO;
        }
-        lock_kernel();
        raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode);
        raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid));
        raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid));
@@ -291,7 +283,6 @@ int sysv_write_inode(struct inode *inode, int wait)
        for (block = 0; block < 10+1+1+1; block++)
                write3byte(sbi, (u8 *)&si->i_data[block],
                        &raw_inode->i_data[3*block]);
-        unlock_kernel();
        mark_buffer_dirty(bh);
        if (wait) {
                sync_dirty_buffer(bh);
@@ -315,9 +306,7 @@ static void sysv_delete_inode(struct inode *inode)
        truncate_inode_pages(&inode->i_data, 0);
        inode->i_size = 0;
        sysv_truncate(inode);
-        lock_kernel();
        sysv_free_inode(inode);
-        unlock_kernel();
 }
 static struct kmem_cache *sysv_inode_cachep;
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index af1914462f02..eaf6d891d46f 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -91,7 +91,6 @@ static int shrink_liability(struct ubifs_info *c, int nr_to_write)
        return nr_written;
 }
 /**
 * run_gc - run garbage collector.
 * @c: UBIFS file-system description object
@@ -628,7 +627,7 @@ void ubifs_convert_page_budget(struct ubifs_info *c)
 *
 * This function releases budget corresponding to a dirty inode. It is usually
 * called when after the inode has been written to the media and marked as
- * clean.
+ * clean. It also causes the "no space" flags to be cleared.
 */
 void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
                                      struct ubifs_inode *ui)
@@ -636,6 +635,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
        struct ubifs_budget_req req;
        memset(&req, 0, sizeof(struct ubifs_budget_req));
+        /* The "no space" flags will be cleared because dd_growth is > 0 */
        req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8);
        ubifs_release_budget(c, &req);
 }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index f55d523c52bb..552fb0111fff 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -528,6 +528,25 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
                inode->i_nlink, dir->i_ino);
        ubifs_assert(mutex_is_locked(&dir->i_mutex));
        ubifs_assert(mutex_is_locked(&inode->i_mutex));
+        /*
+         * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
+         * otherwise has the potential to corrupt the orphan inode list.
+         *
+         * Indeed, consider a scenario when 'vfs_unlink(dirA/fileA)' and
+         * 'vfs_link(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not
+         * lock 'dirA->i_mutex', so this is possible. Both of the functions
+         * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes
+         * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this
+         * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA'
+         * to the list of orphans. After this, 'vfs_link()' will link
+         * 'dirB/fileB' to 'inodeA'. This is a problem because, for example,
+         * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode
+         * to the list of orphans.
+         */
+         if (inode->i_nlink == 0)
+                 return -ENOENT;
        err = dbg_check_synced_i_size(inode);
        if (err)
                return err;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index e8e632a1dcdf..762a7d6cec73 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -293,13 +293,15 @@ void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last)
 *
 * This function is called when the write-buffer timer expires.
 */
-static void wbuf_timer_callback_nolock(unsigned long data)
+static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer)
 {
-        struct ubifs_wbuf *wbuf = (struct ubifs_wbuf *)data;
+        struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer);
+        dbg_io("jhead %d", wbuf->jhead);
        wbuf->need_sync = 1;
        wbuf->c->need_wbuf_sync = 1;
        ubifs_wake_up_bgt(wbuf->c);
+        return HRTIMER_NORESTART;
 }
 /**
@@ -308,13 +310,16 @@ static void wbuf_timer_callback_nolock(unsigned long data)
 */
 static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
 {
-        ubifs_assert(!timer_pending(&wbuf->timer));
+        ubifs_assert(!hrtimer_active(&wbuf->timer));
-        if (!wbuf->timeout)
+        if (wbuf->no_timer)
                return;
+        dbg_io("set timer for jhead %d, %llu-%llu millisecs", wbuf->jhead,
-        wbuf->timer.expires = jiffies + wbuf->timeout;
+               div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC),
-        add_timer(&wbuf->timer);
+               div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta,
+                       USEC_PER_SEC));
+        hrtimer_start_range_ns(&wbuf->timer, wbuf->softlimit, wbuf->delta,
+                               HRTIMER_MODE_REL);
 }
 /**
@@ -323,13 +328,10 @@ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
 */
 static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
 {
-        /*
+        if (wbuf->no_timer)
-         * If the syncer is waiting for the lock (from the background thread's
+                return;
-         * context) and another task is changing write-buffer then the syncing
-         * should be canceled.
-         */
        wbuf->need_sync = 0;
-        del_timer(&wbuf->timer);
+        hrtimer_cancel(&wbuf->timer);
 }
 /**
@@ -349,8 +351,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
                /* Write-buffer is empty or not seeked */
                return 0;
-        dbg_io("LEB %d:%d, %d bytes",
+        dbg_io("LEB %d:%d, %d bytes, jhead %d",
-               wbuf->lnum, wbuf->offs, wbuf->used);
+               wbuf->lnum, wbuf->offs, wbuf->used, wbuf->jhead);
        ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
        ubifs_assert(!(wbuf->avail & 7));
        ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
@@ -390,7 +392,7 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
 * @offs: logical eraseblock offset to seek to
 * @dtype: data type
 *
- * This function targets the write buffer to logical eraseblock @lnum:@offs.
+ * This function targets the write-buffer to logical eraseblock @lnum:@offs.
 * The write-buffer is synchronized if it is not empty. Returns zero in case of
 * success and a negative error code in case of failure.
 */
@@ -399,7 +401,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
 {
        const struct ubifs_info *c = wbuf->c;
-        dbg_io("LEB %d:%d", lnum, offs);
+        dbg_io("LEB %d:%d, jhead %d", lnum, offs, wbuf->jhead);
        ubifs_assert(lnum >= 0 && lnum < c->leb_cnt);
        ubifs_assert(offs >= 0 && offs <= c->leb_size);
        ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
@@ -506,9 +508,9 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
        struct ubifs_info *c = wbuf->c;
        int err, written, n, aligned_len = ALIGN(len, 8), offs;
-        dbg_io("%d bytes (%s) to wbuf at LEB %d:%d", len,
+        dbg_io("%d bytes (%s) to jhead %d wbuf at LEB %d:%d", len,
-               dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->lnum,
+               dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->jhead,
-               wbuf->offs + wbuf->used);
+               wbuf->lnum, wbuf->offs + wbuf->used);
        ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
        ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
        ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
@@ -533,8 +535,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
                memcpy(wbuf->buf + wbuf->used, buf, len);
                if (aligned_len == wbuf->avail) {
-                        dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum,
+                        dbg_io("flush jhead %d wbuf to LEB %d:%d",
-                                wbuf->offs);
+                               wbuf->jhead, wbuf->lnum, wbuf->offs);
                        err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
                                            wbuf->offs, c->min_io_size,
                                            wbuf->dtype);
@@ -562,7 +564,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
         * minimal I/O unit. We have to fill and flush write-buffer and switch
         * to the next min. I/O unit.
         */
-        dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum, wbuf->offs);
+        dbg_io("flush jhead %d wbuf to LEB %d:%d",
+               wbuf->jhead, wbuf->lnum, wbuf->offs);
        memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
        err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
                            c->min_io_size, wbuf->dtype);
@@ -695,7 +698,8 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
        int err, rlen, overlap;
        struct ubifs_ch *ch = buf;
-        dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len);
+        dbg_io("LEB %d:%d, %s, length %d, jhead %d", lnum, offs,
+               dbg_ntype(type), len, wbuf->jhead);
        ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
        ubifs_assert(!(offs & 7) && offs < c->leb_size);
        ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
@@ -819,7 +823,7 @@ out:
 * @c: UBIFS file-system description object
 * @wbuf: write-buffer to initialize
 *
- * This function initializes write buffer. Returns zero in case of success
+ * This function initializes write-buffer. Returns zero in case of success
 * %-ENOMEM in case of failure.
 */
 int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
@@ -845,20 +849,21 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
        wbuf->sync_callback = NULL;
        mutex_init(&wbuf->io_mutex);
        spin_lock_init(&wbuf->lock);
        wbuf->c = c;
-        init_timer(&wbuf->timer);
-        wbuf->timer.function = wbuf_timer_callback_nolock;
-        wbuf->timer.data = (unsigned long)wbuf;
-        wbuf->timeout = DEFAULT_WBUF_TIMEOUT;
        wbuf->next_ino = 0;
+        hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+        wbuf->timer.function = wbuf_timer_callback_nolock;
+        wbuf->softlimit = ktime_set(WBUF_TIMEOUT_SOFTLIMIT, 0);
+        wbuf->delta = WBUF_TIMEOUT_HARDLIMIT - WBUF_TIMEOUT_SOFTLIMIT;
+        wbuf->delta *= 1000000000ULL;
+        ubifs_assert(wbuf->delta <= ULONG_MAX);
        return 0;
 }
 /**
 * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array.
- * @wbuf: the write-buffer whereto add
+ * @wbuf: the write-buffer to add the inode number to
 * @inum: the inode number
 *
 * This function adds an inode number to the inode array of the write-buffer.
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 6db7a6be6c97..8aacd64957a2 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -25,7 +25,6 @@
 /* This file implements EXT2-compatible extended attribute ioctl() calls */
 #include <linux/compat.h>
-#include <linux/smp_lock.h>
 #include <linux/mount.h>
 #include "ubifs.h"
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 10662975d2ef..e5f6cf8a1155 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -53,6 +53,25 @@ static int is_empty(void *buf, int len)
 }
 /**
+ * first_non_ff - find offset of the first non-0xff byte.
+ * @buf: buffer to search in
+ * @len: length of buffer
+ *
+ * This function returns offset of the first non-0xff byte in @buf or %-1 if
+ * the buffer contains only 0xff bytes.
+ */
+static int first_non_ff(void *buf, int len)
+{
+        uint8_t *p = buf;
+        int i;
+        for (i = 0; i < len; i++)
+                if (*p++ != 0xff)
+                        return i;
+        return -1;
+}
+/**
 * get_master_node - get the last valid master node allowing for corruption.
 * @c: UBIFS file-system description object
 * @lnum: LEB number
@@ -343,43 +362,21 @@ int ubifs_write_rcvrd_mst_node(struct ubifs_info *c)
 *
 * This function returns %1 if @offs was in the last write to the LEB whose data
 * is in @buf, otherwise %0 is returned.  The determination is made by checking
- * for subsequent empty space starting from the next min_io_size boundary (or a
+ * for subsequent empty space starting from the next @c->min_io_size boundary.
- * bit less than the common header size if min_io_size is one).
 */
 static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
 {
-        int empty_offs;
+        int empty_offs, check_len;
-        int check_len;
        uint8_t *p;
-        if (c->min_io_size == 1) {
-                check_len = c->leb_size - offs;
-                p = buf + check_len;
-                for (; check_len > 0; check_len--)
-                        if (*--p != 0xff)
-                                break;
-                /*
-                 * 'check_len' is the size of the corruption which cannot be
-                 * more than the size of 1 node if it was caused by an unclean
-                 * unmount.
-                 */
-                if (check_len > UBIFS_MAX_NODE_SZ)
-                        return 0;
-                return 1;
-        }
        /*
-         * Round up to the next c->min_io_size boundary i.e. 'offs' is in the
+         * Round up to the next @c->min_io_size boundary i.e. @offs is in the
         * last wbuf written. After that should be empty space.
         */
        empty_offs = ALIGN(offs + 1, c->min_io_size);
        check_len = c->leb_size - empty_offs;
        p = buf + empty_offs - offs;
+        return is_empty(p, check_len);
-        for (; check_len > 0; check_len--)
-                if (*p++ != 0xff)
-                        return 0;
-        return 1;
 }
 /**
@@ -392,7 +389,7 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
 *
 * This function pads up to the next min_io_size boundary (if there is one) and
 * sets empty space to all 0xff. @buf, @offs and @len are updated to the next
- * min_io_size boundary (if there is one).
+ * @c->min_io_size boundary.
 */
 static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
                      int *offs, int *len)
@@ -402,11 +399,6 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
        lnum = lnum;
        dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs);
-        if (c->min_io_size == 1) {
-                memset(*buf, 0xff, c->leb_size - *offs);
-                return;
-        }
        ubifs_assert(!(*offs & 7));
        empty_offs = ALIGN(*offs, c->min_io_size);
        pad_len = empty_offs - *offs;
@@ -566,8 +558,8 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
 *
 * This function does a scan of a LEB, but caters for errors that might have
 * been caused by the unclean unmount from which we are attempting to recover.
- *
+ * Returns the scanned LEB information in case of success, %-EUCLEAN if an
- * This function returns %0 on success and a negative error code on failure.
+ * unrecoverable corruption is found, and other negative error codes otherwise.
 */
 struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
                                         int offs, void *sbuf, int grouped)
@@ -666,7 +658,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
                        goto corrupted;
                default:
                        dbg_err("unknown");
-                        goto corrupted;
+                        err = -EINVAL;
+                        goto error;
                }
        }
@@ -675,8 +668,13 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
                        clean_buf(c, &buf, lnum, &offs, &len);
                        need_clean = 1;
                } else {
-                        ubifs_err("corrupt empty space at LEB %d:%d",
+                        int corruption = first_non_ff(buf, len);
-                                  lnum, offs);
+                        ubifs_err("corrupt empty space LEB %d:%d, corruption "
+                                  "starts at %d", lnum, offs, corruption);
+                        /* Make sure we dump interesting non-0xFF data */
+                        offs += corruption; /* index is relative to buf */
+                        buf += corruption;
                        goto corrupted;
                }
        }
@@ -836,7 +834,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
 static int recover_head(const struct ubifs_info *c, int lnum, int offs,
                        void *sbuf)
 {
-        int len, err, need_clean = 0;
+        int len, err;
        if (c->min_io_size > 1)
                len = c->min_io_size;
@@ -850,19 +848,7 @@ static int recover_head(const struct ubifs_info *c, int lnum, int offs,
        /* Read at the head location and check it is empty flash */
        err = ubi_read(c->ubi, lnum, sbuf, offs, len);
-        if (err)
+        if (err || !is_empty(sbuf, len)) {
-                need_clean = 1;
-        else {
-                uint8_t *p = sbuf;
-                while (len--)
-                        if (*p++ != 0xff) {
-                                need_clean = 1;
-                                break;
-                        }
-        }
-        if (need_clean) {
                dbg_rcvry("cleaning head at %d:%d", lnum, offs);
                if (offs == 0)
                        return ubifs_leb_unmap(c, lnum);
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 11cc80125a49..2970500f32df 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -837,9 +837,10 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
        dbg_mnt("replay log LEB %d:%d", lnum, offs);
        sleb = ubifs_scan(c, lnum, offs, sbuf);
-        if (IS_ERR(sleb)) {
+        if (IS_ERR(sleb) ) {
-                if (c->need_recovery)
+                if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
-                        sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
+                        return PTR_ERR(sleb);
+                sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
                if (IS_ERR(sleb))
                        return PTR_ERR(sleb);
        }
@@ -957,7 +958,7 @@ out:
        return err;
 out_dump:
-        ubifs_err("log error detected while replying the log at LEB %d:%d",
+        ubifs_err("log error detected while replaying the log at LEB %d:%d",
                  lnum, offs + snod->offs);
        dbg_dump_node(c, snod->node);
        ubifs_scan_destroy(sleb);
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 0ed82479b44b..892ebfee4fe5 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -238,12 +238,12 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
 {
        int len;
-        ubifs_err("corrupted data at LEB %d:%d", lnum, offs);
+        ubifs_err("corruption at LEB %d:%d", lnum, offs);
        if (dbg_failure_mode)
                return;
        len = c->leb_size - offs;
-        if (len > 4096)
+        if (len > 8192)
-                len = 4096;
+                len = 8192;
        dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs);
        print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1);
 }
@@ -256,7 +256,9 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
 * @sbuf: scan buffer (must be c->leb_size)
 *
 * This function scans LEB number @lnum and returns complete information about
- * its contents. Returns an error code in case of failure.
+ * its contents. Returns the scanned information in case of success,
+ * %-EUCLEAN if the LEB needs recovery, and other negative error codes in case
+ * of failure.
 */
 struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
                                  int offs, void *sbuf)
@@ -279,7 +281,6 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
                cond_resched();
                ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
                if (ret > 0) {
                        /* Padding bytes or a valid padding node */
                        offs += ret;
@@ -304,7 +305,8 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
                        goto corrupted;
                default:
                        dbg_err("unknown");
-                        goto corrupted;
+                        err = -EINVAL;
+                        goto error;
                }
                err = ubifs_add_snod(c, sleb, buf, offs);
@@ -317,8 +319,10 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
                len -= node_len;
        }
-        if (offs % c->min_io_size)
+        if (offs % c->min_io_size) {
-                goto corrupted;
+                ubifs_err("empty space starts at non-aligned offset %d", offs);
+                goto corrupted;
+        }
        ubifs_end_scan(c, sleb, lnum, offs);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 3589eab02a2f..26d2e0d80465 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -361,6 +361,11 @@ static void ubifs_delete_inode(struct inode *inode)
 out:
        if (ui->dirty)
                ubifs_release_dirty_inode_budget(c, ui);
+        else {
+                /* We've deleted something - clean the "no space" flags */
+                c->nospace = c->nospace_rp = 0;
+                smp_wmb();
+        }
        clear_inode(inode);
 }
@@ -792,7 +797,7 @@ static int alloc_wbufs(struct ubifs_info *c)
         * does not need to be synchronized by timer.
         */
        c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM;
-        c->jheads[GCHD].wbuf.timeout = 0;
+        c->jheads[GCHD].wbuf.no_timer = 1;
        return 0;
 }
@@ -933,6 +938,27 @@ static const match_table_t tokens = {
 };
 /**
+ * parse_standard_option - parse a standard mount option.
+ * @option: the option to parse
+ *
+ * Normally, standard mount options like "sync" are passed to file-systems as
+ * flags. However, when a "rootflags=" kernel boot parameter is used, they may
+ * be present in the options string. This function tries to deal with this
+ * situation and parse standard options. Returns 0 if the option was not
+ * recognized, and the corresponding integer flag if it was.
+ *
+ * UBIFS is only interested in the "sync" option, so do not check for anything
+ * else.
+ */
+static int parse_standard_option(const char *option)
+{
+        ubifs_msg("parse %s", option);
+        if (!strcmp(option, "sync"))
+                return MS_SYNCHRONOUS;
+        return 0;
+}
+/**
 * ubifs_parse_options - parse mount parameters.
 * @c: UBIFS file-system description object
 * @options: parameters to parse
@@ -960,7 +986,7 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
                switch (token) {
                /*
                 * %Opt_fast_unmount and %Opt_norm_unmount options are ignored.
-                 * We accepte them in order to be backware-compatible. But this
+                 * We accept them in order to be backward-compatible. But this
                 * should be removed at some point.
                 */
                case Opt_fast_unmount:
@@ -1008,9 +1034,19 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
                        break;
                }
                default:
-                        ubifs_err("unrecognized mount option \"%s\" "
+                {
-                                  "or missing value", p);
+                        unsigned long flag;
-                        return -EINVAL;
+                        struct super_block *sb = c->vfs_sb;
+                        flag = parse_standard_option(p);
+                        if (!flag) {
+                                ubifs_err("unrecognized mount option \"%s\" "
+                                          "or missing value", p);
+                                return -EINVAL;
+                        }
+                        sb->s_flags |= flag;
+                        break;
+                }
                }
        }
@@ -1180,6 +1216,7 @@ static int mount_ubifs(struct ubifs_info *c)
        if (!ubifs_compr_present(c->default_compr)) {
                ubifs_err("'compressor \"%s\" is not compiled in",
                          ubifs_compr_name(c->default_compr));
+                err = -ENOTSUPP;
                goto out_free;
        }
@@ -1250,6 +1287,9 @@ static int mount_ubifs(struct ubifs_info *c)
        if (err)
                goto out_journal;
+        /* Calculate 'min_idx_lebs' after journal replay */
+        c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
        err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only);
        if (err)
                goto out_orphans;
@@ -1656,7 +1696,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
        for (i = 0; i < c->jhead_cnt; i++) {
                ubifs_wbuf_sync(&c->jheads[i].wbuf);
-                del_timer_sync(&c->jheads[i].wbuf.timer);
+                hrtimer_cancel(&c->jheads[i].wbuf.timer);
        }
        c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
@@ -1717,10 +1757,8 @@ static void ubifs_put_super(struct super_block *sb)
                /* Synchronize write-buffers */
                if (c->jheads)
-                        for (i = 0; i < c->jhead_cnt; i++) {
+                        for (i = 0; i < c->jhead_cnt; i++)
                                ubifs_wbuf_sync(&c->jheads[i].wbuf);
-                                del_timer_sync(&c->jheads[i].wbuf.timer);
-                        }
                /*
                 * On fatal errors c->ro_media is set to 1, in which case we do
@@ -1911,6 +1949,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
        INIT_LIST_HEAD(&c->orph_list);
        INIT_LIST_HEAD(&c->orph_new);
+        c->vfs_sb = sb;
        c->highest_inum = UBIFS_FIRST_INO;
        c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;
@@ -1937,18 +1976,19 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
        err  = bdi_init(&c->bdi);
        if (err)
                goto out_close;
+        err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d",
+                           c->vi.ubi_num, c->vi.vol_id);
+        if (err)
+                goto out_bdi;
        err = ubifs_parse_options(c, data, 0);
        if (err)
                goto out_bdi;
-        c->vfs_sb = sb;
        sb->s_fs_info = c;
        sb->s_magic = UBIFS_SUPER_MAGIC;
        sb->s_blocksize = UBIFS_BLOCK_SIZE;
        sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT;
-        sb->s_dev = c->vi.cdev;
        sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c);
        if (c->max_inode_sz > MAX_LFS_FILESIZE)
                sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
@@ -1993,16 +2033,9 @@ out_free:
 static int sb_test(struct super_block *sb, void *data)
 {
        dev_t *dev = data;
+        struct ubifs_info *c = sb->s_fs_info;
-        return sb->s_dev == *dev;
+        return c->vi.cdev == *dev;
-}
-static int sb_set(struct super_block *sb, void *data)
-{
-        dev_t *dev = data;
-        sb->s_dev = *dev;
-        return 0;
 }
 static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
@@ -2030,7 +2063,7 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
        dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id);
-        sb = sget(fs_type, &sb_test, &sb_set, &vi.cdev);
+        sb = sget(fs_type, &sb_test, &set_anon_super, &vi.cdev);
        if (IS_ERR(sb)) {
                err = PTR_ERR(sb);
                goto out_close;
@@ -2070,16 +2103,11 @@ out_close:
        return err;
 }
-static void ubifs_kill_sb(struct super_block *sb)
-{
-        generic_shutdown_super(sb);
-}
 static struct file_system_type ubifs_fs_type = {
        .name    = "ubifs",
        .owner   = THIS_MODULE,
        .get_sb  = ubifs_get_sb,
-        .kill_sb = ubifs_kill_sb
+        .kill_sb = kill_anon_super,
 };
 /*
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 0a8341e14088..a29349094422 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -95,8 +95,9 @@
 */
 #define BGT_NAME_PATTERN "ubifs_bgt%d_%d"
-/* Default write-buffer synchronization timeout (5 secs) */
+/* Write-buffer synchronization timeout interval in seconds */
-#define DEFAULT_WBUF_TIMEOUT (5 * HZ)
+#define WBUF_TIMEOUT_SOFTLIMIT 3
+#define WBUF_TIMEOUT_HARDLIMIT 5
 /* Maximum possible inode number (only 32-bit inodes are supported now) */
 #define MAX_INUM 0xFFFFFFFF
@@ -650,9 +651,12 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
 * @io_mutex: serializes write-buffer I/O
 * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes
 *        fields
+ * @softlimit: soft write-buffer timeout interval
+ * @delta: hard and soft timeouts delta (the timer expire interval is @softlimit
+ *         and @softlimit + @delta)
 * @timer: write-buffer timer
- * @timeout: timer expire interval in jiffies
+ * @no_timer: non-zero if this write-buffer does not have a timer
- * @need_sync: it is set if its timer expired and needs sync
+ * @need_sync: non-zero if the timer expired and the wbuf needs sync'ing
 * @next_ino: points to the next position of the following inode number
 * @inodes: stores the inode numbers of the nodes which are in wbuf
 *
@@ -678,9 +682,11 @@ struct ubifs_wbuf {
        int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
        struct mutex io_mutex;
        spinlock_t lock;
-        struct timer_list timer;
+        ktime_t softlimit;
-        int timeout;
+        unsigned long long delta;
-        int need_sync;
+        struct hrtimer timer;
+        unsigned int no_timer:1;
+        unsigned int need_sync:1;
        int next_ino;
        ino_t *inodes;
 };
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index cfd31e229c89..adafcf556531 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -55,9 +55,9 @@
 * ACL support is not implemented.
 */
+#include "ubifs.h"
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
-#include "ubifs.h"
 /*
 * Limit the number of extended attributes per inode so that the total size
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index e48e9a3af763..1e068535b58b 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -238,7 +238,7 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
        mutex_lock(&sbi->s_alloc_mutex);
        part_len = sbi->s_partmaps[partition].s_partition_len;
-        if (first_block < 0 || first_block >= part_len)
+        if (first_block >= part_len)
                goto out;
        if (first_block + block_count > part_len)
@@ -297,7 +297,7 @@ static int udf_bitmap_new_block(struct super_block *sb,
        mutex_lock(&sbi->s_alloc_mutex);
 repeat:
-        if (goal < 0 || goal >= sbi->s_partmaps[partition].s_partition_len)
+        if (goal >= sbi->s_partmaps[partition].s_partition_len)
                goal = 0;
        nr_groups = bitmap->s_nr_groups;
@@ -666,8 +666,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
        int8_t etype = -1;
        struct udf_inode_info *iinfo;
-        if (first_block < 0 ||
+        if (first_block >= sbi->s_partmaps[partition].s_partition_len)
-                first_block >= sbi->s_partmaps[partition].s_partition_len)
                return 0;
        iinfo = UDF_I(table);
@@ -743,7 +742,7 @@ static int udf_table_new_block(struct super_block *sb,
                return newblock;
        mutex_lock(&sbi->s_alloc_mutex);
-        if (goal < 0 || goal >= sbi->s_partmaps[partition].s_partition_len)
+        if (goal >= sbi->s_partmaps[partition].s_partition_len)
                goal = 0;
        /* We search for the closest matching block to goal. If we find
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 703843f30ffd..1b88fd5df05d 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -56,7 +56,12 @@ unsigned long udf_get_last_block(struct super_block *sb)
        struct block_device *bdev = sb->s_bdev;
        unsigned long lblock = 0;
-        if (ioctl_by_bdev(bdev, CDROM_LAST_WRITTEN, (unsigned long) &lblock))
+        /*
+         * ioctl failed or returned obviously bogus value?
+         * Try using the device size...
+         */
+        if (ioctl_by_bdev(bdev, CDROM_LAST_WRITTEN, (unsigned long) &lblock) ||
+            lblock == 0)
                lblock = bdev->bd_inode->i_size >> sb->s_blocksize_bits;
        if (lblock)
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 6832135159b6..9d1b8c2e6c45 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1087,11 +1087,23 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
        struct udf_inode_info *vati;
        uint32_t pos;
        struct virtualAllocationTable20 *vat20;
+        sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
        /* VAT file entry is in the last recorded block */
        ino.partitionReferenceNum = type1_index;
        ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
        sbi->s_vat_inode = udf_iget(sb, &ino);
+        if (!sbi->s_vat_inode &&
+            sbi->s_last_block != blocks - 1) {
+                printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the"
+                       " last recorded block (%lu), retrying with the last "
+                       "block of the device (%lu).\n",
+                       (unsigned long)sbi->s_last_block,
+                       (unsigned long)blocks - 1);
+                ino.partitionReferenceNum = type1_index;
+                ino.logicalBlockNum = blocks - 1 - map->s_partition_root;
+                sbi->s_vat_inode = udf_iget(sb, &ino);
+        }
        if (!sbi->s_vat_inode)
                return 1;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 3d2512c21f05..7cf33379fd46 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -56,9 +56,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
        UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks);
-        if (i_block < 0) {
+        if (i_block < direct_blocks) {
-                ufs_warning(inode->i_sb, "ufs_block_to_path", "block < 0");
-        } else if (i_block < direct_blocks) {
                offsets[n++] = i_block;
        } else if ((i_block -= direct_blocks) < indirect_blocks) {
                offsets[n++] = UFS_IND_BLOCK;
@@ -440,8 +438,6 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head
        lock_kernel();
        UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
-        if (fragment < 0)
-                goto abort_negative;
        if (fragment >
            ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb)
             << uspi->s_fpbshift))
@@ -504,10 +500,6 @@ abort:
        unlock_kernel();
        return err;
-abort_negative:
-        ufs_warning(sb, "ufs_get_block", "block < 0");
-        goto abort;
 abort_too_big:
        ufs_warning(sb, "ufs_get_block", "block > big");
        goto abort;
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 1cd3b55ee3d2..2d3f90afe5f1 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -53,7 +53,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
                        printk(KERN_ERR "XFS: possible memory allocation "
                                        "deadlock in %s (mode:0x%x)\n",
                                        __func__, lflags);
-                congestion_wait(WRITE, HZ/50);
+                congestion_wait(BLK_RW_ASYNC, HZ/50);
        } while (1);
 }
@@ -130,7 +130,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
                        printk(KERN_ERR "XFS: possible memory allocation "
                                        "deadlock in %s (mode:0x%x)\n",
                                        __func__, lflags);
-                congestion_wait(WRITE, HZ/50);
+                congestion_wait(BLK_RW_ASYNC, HZ/50);
        } while (1);
 }
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index 1e9d1246eebc..b23a54506446 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -25,14 +25,10 @@
 #include <linux/posix_acl_xattr.h>
-#define XFS_ACL_NOT_CACHED      ((void *)-1)
 /*
 * Locking scheme:
 *  - all ACL updates are protected by inode->i_mutex, which is taken before
 *    calling into this file.
- *  - access and updates to the ip->i_acl and ip->i_default_acl pointers are
- *    protected by inode->i_lock.
 */
 STATIC struct posix_acl *
@@ -102,59 +98,35 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
        }
 }
-/*
- * Update the cached ACL pointer in the inode.
- *
- * Because we don't hold any locks while reading/writing the attribute
- * from/to disk another thread could have raced and updated the cached
- * ACL value before us. In that case we release the previous cached value
- * and update it with our new value.
- */
-STATIC void
-xfs_update_cached_acl(struct inode *inode, struct posix_acl **p_acl,
-                struct posix_acl *acl)
-{
-        spin_lock(&inode->i_lock);
-        if (*p_acl && *p_acl != XFS_ACL_NOT_CACHED)
-                posix_acl_release(*p_acl);
-        *p_acl = posix_acl_dup(acl);
-        spin_unlock(&inode->i_lock);
-}
 struct posix_acl *
 xfs_get_acl(struct inode *inode, int type)
 {
        struct xfs_inode *ip = XFS_I(inode);
-        struct posix_acl *acl = NULL, **p_acl;
+        struct posix_acl *acl;
        struct xfs_acl *xfs_acl;
        int len = sizeof(struct xfs_acl);
        char *ea_name;
        int error;
+        acl = get_cached_acl(inode, type);
+        if (acl != ACL_NOT_CACHED)
+                return acl;
        switch (type) {
        case ACL_TYPE_ACCESS:
                ea_name = SGI_ACL_FILE;
-                p_acl = &ip->i_acl;
                break;
        case ACL_TYPE_DEFAULT:
                ea_name = SGI_ACL_DEFAULT;
-                p_acl = &ip->i_default_acl;
                break;
        default:
-                return ERR_PTR(-EINVAL);
+                BUG();
        }
-        spin_lock(&inode->i_lock);
-        if (*p_acl != XFS_ACL_NOT_CACHED)
-                acl = posix_acl_dup(*p_acl);
-        spin_unlock(&inode->i_lock);
        /*
         * If we have a cached ACLs value just return it, not need to
         * go out to the disk.
         */
-        if (acl)
-                return acl;
        xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
        if (!xfs_acl)
@@ -165,7 +137,7 @@ xfs_get_acl(struct inode *inode, int type)
                /*
                 * If the attribute doesn't exist make sure we have a negative
                 * cache entry, for any other error assume it is transient and
-                 * leave the cache entry as XFS_ACL_NOT_CACHED.
+                 * leave the cache entry as ACL_NOT_CACHED.
                 */
                if (error == -ENOATTR) {
                        acl = NULL;
@@ -179,7 +151,7 @@ xfs_get_acl(struct inode *inode, int type)
                goto out;
 out_update_cache:
-        xfs_update_cached_acl(inode, p_acl, acl);
+        set_cached_acl(inode, type, acl);
 out:
        kfree(xfs_acl);
        return acl;
@@ -189,7 +161,6 @@ STATIC int
 xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 {
        struct xfs_inode *ip = XFS_I(inode);
-        struct posix_acl **p_acl;
        char *ea_name;
        int error;
@@ -199,13 +170,11 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
        switch (type) {
        case ACL_TYPE_ACCESS:
                ea_name = SGI_ACL_FILE;
-                p_acl = &ip->i_acl;
                break;
        case ACL_TYPE_DEFAULT:
                if (!S_ISDIR(inode->i_mode))
                        return acl ? -EACCES : 0;
                ea_name = SGI_ACL_DEFAULT;
-                p_acl = &ip->i_default_acl;
                break;
        default:
                return -EINVAL;
@@ -242,7 +211,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
        }
        if (!error)
-                xfs_update_cached_acl(inode, p_acl, acl);
+                set_cached_acl(inode, type, acl);
        return error;
 }
@@ -384,30 +353,6 @@ xfs_acl_chmod(struct inode *inode)
        return error;
 }
-void
-xfs_inode_init_acls(struct xfs_inode *ip)
-{
-        /*
-         * No need for locking, inode is not live yet.
-         */
-        ip->i_acl = XFS_ACL_NOT_CACHED;
-        ip->i_default_acl = XFS_ACL_NOT_CACHED;
-}
-void
-xfs_inode_clear_acls(struct xfs_inode *ip)
-{
-        /*
-         * No need for locking here, the inode is not live anymore
-         * and just about to be freed.
-         */
-        if (ip->i_acl != XFS_ACL_NOT_CACHED)
-                posix_acl_release(ip->i_acl);
-        if (ip->i_default_acl != XFS_ACL_NOT_CACHED)
-                posix_acl_release(ip->i_default_acl);
-}
 /*
 * System xattr handlers.
 *
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 178c20c13e83..965df1227d64 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -412,7 +412,7 @@ _xfs_buf_lookup_pages(
                        XFS_STATS_INC(xb_page_retries);
                        xfsbufd_wakeup(0, gfp_mask);
-                        congestion_wait(WRITE, HZ/50);
+                        congestion_wait(BLK_RW_ASYNC, HZ/50);
                        goto retry;
                }
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index f4e255441574..0542fd507649 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -41,7 +41,6 @@
 #include "xfs_ioctl.h"
 #include <linux/dcache.h>
-#include <linux/smp_lock.h>
 static struct vm_operations_struct xfs_file_vm_ops;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index f65a53f8752f..6127e24062d0 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -24,7 +24,7 @@
 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
 * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
 */
-#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
+#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
 # define XFS_BIG_BLKNOS 1
 # define XFS_BIG_INUMS  1
 #else
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 2e09efbca8db..a220d36f789b 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -616,7 +616,7 @@ xfs_max_file_offset(
         */
 #if BITS_PER_LONG == 32
-# if defined(CONFIG_LBD)
+# if defined(CONFIG_LBDAF)
        ASSERT(sizeof(sector_t) == 8);
        pagefactor = PAGE_CACHE_SIZE;
        bitshift = BITS_PER_LONG;
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 63dc1f2efad5..947b150df8ed 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -46,8 +46,6 @@ extern int xfs_check_acl(struct inode *inode, int mask);
 extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
 extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
 extern int xfs_acl_chmod(struct inode *inode);
-extern void xfs_inode_init_acls(struct xfs_inode *ip);
-extern void xfs_inode_clear_acls(struct xfs_inode *ip);
 extern int posix_acl_access_exists(struct inode *inode);
 extern int posix_acl_default_exists(struct inode *inode);
@@ -57,8 +55,6 @@ extern struct xattr_handler xfs_xattr_system_handler;
 # define xfs_get_acl(inode, type)                       NULL
 # define xfs_inherit_acl(inode, default_acl)            0
 # define xfs_acl_chmod(inode)                           0
-# define xfs_inode_init_acls(ip)
-# define xfs_inode_clear_acls(ip)
 # define posix_acl_access_exists(inode)                 0
 # define posix_acl_default_exists(inode)                0
 #endif /* CONFIG_XFS_POSIX_ACL */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 76c540f719e4..34ec86923f7e 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -64,6 +64,10 @@ xfs_inode_alloc(
        ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
        if (!ip)
                return NULL;
+        if (inode_init_always(mp->m_super, VFS_I(ip))) {
+                kmem_zone_free(xfs_inode_zone, ip);
+                return NULL;
+        }
        ASSERT(atomic_read(&ip->i_iocount) == 0);
        ASSERT(atomic_read(&ip->i_pincount) == 0);
@@ -83,7 +87,6 @@ xfs_inode_alloc(
        memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
        ip->i_size = 0;
        ip->i_new_size = 0;
-        xfs_inode_init_acls(ip);
        /*
         * Initialize inode's trace buffers.
@@ -106,17 +109,6 @@ xfs_inode_alloc(
 #ifdef XFS_DIR2_TRACE
        ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
 #endif
-        /*
-        * Now initialise the VFS inode. We do this after the xfs_inode
-        * initialisation as internal failures will result in ->destroy_inode
-        * being called and that will pass down through the reclaim path and
-        * free the XFS inode. This path requires the XFS inode to already be
-        * initialised. Hence if this call fails, the xfs_inode has already
-        * been freed and we should not reference it at all in the error
-        * handling.
-        */
-        if (!inode_init_always(mp->m_super, VFS_I(ip)))
-                return NULL;
        /* prevent anyone from using this yet */
        VFS_I(ip)->i_state = I_NEW|I_LOCK;
@@ -124,6 +116,71 @@ xfs_inode_alloc(
        return ip;
 }
+STATIC void /* Tear down everything hanging off @ip, then free the inode itself. */
+xfs_inode_free(
+        struct xfs_inode        *ip) /* inode to destroy; returned to xfs_inode_zone below */
+{
+        switch (ip->i_d.di_mode & S_IFMT) { /* data fork is destroyed only for these types */
+        case S_IFREG:
+        case S_IFDIR:
+        case S_IFLNK:
+                xfs_idestroy_fork(ip, XFS_DATA_FORK);
+                break;
+        }
+        if (ip->i_afp) /* attr fork is destroyed only when i_afp is set */
+                xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+#ifdef XFS_INODE_TRACE
+        ktrace_free(ip->i_trace);
+#endif
+#ifdef XFS_BMAP_TRACE
+        ktrace_free(ip->i_xtrace);
+#endif
+#ifdef XFS_BTREE_TRACE
+        ktrace_free(ip->i_btrace);
+#endif
+#ifdef XFS_RW_TRACE
+        ktrace_free(ip->i_rwtrace);
+#endif
+#ifdef XFS_ILOCK_TRACE
+        ktrace_free(ip->i_lock_trace);
+#endif
+#ifdef XFS_DIR2_TRACE
+        ktrace_free(ip->i_dir_trace);
+#endif
+        if (ip->i_itemp) {
+                /*
+                 * Only if we are shutting down the fs will we see an
+                 * inode still in the AIL. If it is there, we should remove
+                 * it to prevent a use-after-free from occurring.
+                 */
+                xfs_log_item_t  *lip = &ip->i_itemp->ili_item;
+                struct xfs_ail  *ailp = lip->li_ailp;
+                ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
+                                       XFS_FORCED_SHUTDOWN(ip->i_mount));
+                if (lip->li_flags & XFS_LI_IN_AIL) { /* unlocked peek; re-checked under xa_lock */
+                        spin_lock(&ailp->xa_lock);
+                        if (lip->li_flags & XFS_LI_IN_AIL)
+                                xfs_trans_ail_delete(ailp, lip); /* NOTE(review): presumably drops xa_lock itself, since only the else branch unlocks - confirm */
+                        else
+                                spin_unlock(&ailp->xa_lock);
+                }
+                xfs_inode_item_destroy(ip);
+                ip->i_itemp = NULL;
+        }
+        /* sanity checks: no I/O or pin counts left, flags lock not held, flush completed */
+        ASSERT(atomic_read(&ip->i_iocount) == 0);
+        ASSERT(atomic_read(&ip->i_pincount) == 0);
+        ASSERT(!spin_is_locked(&ip->i_flags_lock));
+        ASSERT(completion_done(&ip->i_flush));
+        kmem_zone_free(xfs_inode_zone, ip); /* ip must not be touched after this */
+}
 /*
 * Check the validity of the inode we just found it the cache
 */
@@ -168,7 +225,7 @@ xfs_iget_cache_hit(
                 * errors cleanly, then tag it so it can be set up correctly
                 * later.
                 */
-                if (!inode_init_always(mp->m_super, VFS_I(ip))) {
+                if (inode_init_always(mp->m_super, VFS_I(ip))) {
                        error = ENOMEM;
                        goto out_error;
                }
@@ -300,7 +357,8 @@ out_preload_end:
        if (lock_flags)
                xfs_iunlock(ip, lock_flags);
 out_destroy:
-        xfs_destroy_inode(ip);
+        __destroy_inode(VFS_I(ip));
+        xfs_inode_free(ip);
        return error;
 }
@@ -505,63 +563,7 @@ xfs_ireclaim(
        xfs_qm_dqdetach(ip);
        xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-        switch (ip->i_d.di_mode & S_IFMT) {
+        xfs_inode_free(ip);
-        case S_IFREG:
-        case S_IFDIR:
-        case S_IFLNK:
-                xfs_idestroy_fork(ip, XFS_DATA_FORK);
-                break;
-        }
-        if (ip->i_afp)
-                xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-#ifdef XFS_INODE_TRACE
-        ktrace_free(ip->i_trace);
-#endif
-#ifdef XFS_BMAP_TRACE
-        ktrace_free(ip->i_xtrace);
-#endif
-#ifdef XFS_BTREE_TRACE
-        ktrace_free(ip->i_btrace);
-#endif
-#ifdef XFS_RW_TRACE
-        ktrace_free(ip->i_rwtrace);
-#endif
-#ifdef XFS_ILOCK_TRACE
-        ktrace_free(ip->i_lock_trace);
-#endif
-#ifdef XFS_DIR2_TRACE
-        ktrace_free(ip->i_dir_trace);
-#endif
-        if (ip->i_itemp) {
-                /*
-                 * Only if we are shutting down the fs will we see an
-                 * inode still in the AIL. If it is there, we should remove
-                 * it to prevent a use-after-free from occurring.
-                 */
-                xfs_log_item_t  *lip = &ip->i_itemp->ili_item;
-                struct xfs_ail  *ailp = lip->li_ailp;
-                ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
-                                       XFS_FORCED_SHUTDOWN(ip->i_mount));
-                if (lip->li_flags & XFS_LI_IN_AIL) {
-                        spin_lock(&ailp->xa_lock);
-                        if (lip->li_flags & XFS_LI_IN_AIL)
-                                xfs_trans_ail_delete(ailp, lip);
-                        else
-                                spin_unlock(&ailp->xa_lock);
-                }
-                xfs_inode_item_destroy(ip);
-                ip->i_itemp = NULL;
-        }
-        /* asserts to verify all state is correct here */
-        ASSERT(atomic_read(&ip->i_iocount) == 0);
-        ASSERT(atomic_read(&ip->i_pincount) == 0);
-        ASSERT(!spin_is_locked(&ip->i_flags_lock));
-        ASSERT(completion_done(&ip->i_flush));
-        xfs_inode_clear_acls(ip);
-        kmem_zone_free(xfs_inode_zone, ip);
 }
 /*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 77016702938b..65f24a3cc992 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -273,11 +273,6 @@ typedef struct xfs_inode {
        /* VFS inode */
        struct inode            i_vnode;        /* embedded VFS inode */
-#ifdef CONFIG_XFS_POSIX_ACL
-        struct posix_acl        *i_acl;
-        struct posix_acl        *i_default_acl;
-#endif
        /* Trace buffers per inode. */
 #ifdef XFS_INODE_TRACE
        struct ktrace           *i_trace;       /* general inode trace */
@@ -315,23 +310,6 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
 }
 /*
- * Get rid of a partially initialized inode.
- *
- * We have to go through destroy_inode to make sure allocations
- * from init_inode_always like the security data are undone.
- *
- * We mark the inode bad so that it takes the short cut in
- * the reclaim path instead of going through the flush path
- * which doesn't make sense for an inode that has never seen the
- * light of day.
- */
-static inline void xfs_destroy_inode(struct xfs_inode *ip)
-{
-        make_bad_inode(VFS_I(ip));
-        return destroy_inode(VFS_I(ip));
-}
-/*
 * i_flags helper functions
 */
 static inline void